From ce117105fde1d27eb75caa84bd9f0a2f4ec5bfe5 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sat, 2 May 2026 04:46:11 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: sampluralis/llama-sft-muon Source: Original Platform --- .gitattributes | 36 + README.md | 57 + all_results.json | 8 + chat_template.jinja | 96 + config.json | 35 + generation_config.json | 11 + model.safetensors | 3 + special_tokens_map.json | 11 + tokenizer.json | 3 + tokenizer_config.json | 2063 + train_results.json | 8 + trainer_state.json | 166453 +++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 168787 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..1b079be --- /dev/null +++ b/README.md @@ -0,0 +1,57 @@ +--- +base_model: gshasiri/SmolLM3-Mid +library_name: transformers +model_name: llama-sft-muon +tags: +- generated_from_trainer +- trl +- sft +licence: license +--- + +# Model Card for llama-sft-muon + +This model is a fine-tuned version of [gshasiri/SmolLM3-Mid](https://huggingface.co/gshasiri/SmolLM3-Mid). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="sampluralis/llama-sft-muon", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/ajanthan-pluralis-research/huggingface/runs/yo30dbp4) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 4.57.6 +- Pytorch: 2.6.0+cu126 +- Datasets: 4.6.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..48f71fc --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 5.519394781646763e+19, + "train_loss": 3.57993624316737, + "train_runtime": 147576.5591, + "train_samples": 1444084, + "train_samples_per_second": 1.002, + "train_steps_per_second": 0.125 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..b481759 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,96 @@ +{# ───── defaults ───── #} +{%- if enable_thinking is not defined -%} +{%- set enable_thinking = true -%} +{%- endif -%} + +{# ───── reasoning mode ───── #} +{%- if enable_thinking -%} + {%- set reasoning_mode = "/think" -%} +{%- else -%} + {%- set reasoning_mode = "/no_think" -%} +{%- endif -%} + +{# ───── header (system message) ───── #} +{{- "<|im_start|>system\n" -}} + +{%- if messages[0].role == "system" -%} + {%- set system_message = messages[0].content -%} + {%- if "/no_think" in system_message -%} + {%- set reasoning_mode = "/no_think" -%} + {%- elif "/think" in system_message -%} + {%- set reasoning_mode = "/think" -%} + {%- endif -%} + {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%} +{%- endif -%} + +{%- if "/system_override" in system_message -%} + {{- custom_instructions.replace("/system_override", "").rstrip() -}} + {{- "<|im_end|>\n" -}} +{%- else -%} + {{- "## Metadata\n\n" -}} + {{- "Knowledge Cutoff Date: June 2025\n" -}} + {%- set today = strftime_now("%d %B %Y") -%} + {{- "Today Date: " ~ today ~ "\n" -}} + {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}} + + {{- "## Custom Instructions\n\n" -}} + {%- if custom_instructions -%} + {{- custom_instructions + "\n\n" -}} + {%- elif reasoning_mode == "/think" -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: Thought section Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}} + {%- else -%} + {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}} + {%- endif -%} + + {{- "## Tools\n\n" -}} + {{- "### XML Tools\n\n" -}} + {%- if tools -%} + {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n\n") -%} + {%- for tool in tools -%} + {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | tojson) ~ "\n" -%} + {%- endfor -%} + {%- set xml_tools = ns.xml_tool_string + "\n\nFor each function call, return a json object with function name and arguments within XML tags." -%} + {%- endif -%} + {%- if xml_tools -%} + {{- xml_tools -}} + {%- else -%} + {{- "None" -}} + {%- endif -%} + {{- "\n\n" -}} + {{- "### Python Tools\n\n" -}} + {%- if python_tools -%} + {{- python_tools -}} + {%- else -%} + {{- "None" -}} + {%- endif -%} + {{- "\n\n" -}} + {{- "<|im_end|>\n" -}} +{%- endif -%} + +{# ───── main loop ───── #} +{%- for message in messages -%} + {%- set content = message.content if message.content is string else "" -%} + {%- if message.role == "user" -%} + {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }} + {%- elif message.role == "assistant" -%} + {% generation %} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" + content.lstrip("\n") + "<|im_end|>\n" }} + {%- endif -%} + {% endgeneration %} + + {%- elif message.role == "tool" -%} + {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }} + {%- endif -%} +{%- endfor -%} + +{# ───── generation prompt ───── #} +{%- if add_generation_prompt -%} + {%- if reasoning_mode == "/think" -%} + {{ "<|im_start|>assistant\n" }} + {%- else -%} + {{ "<|im_start|>assistant\n" + "\n\n\n" }} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..6efbded --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128012, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "transformers_version": "4.57.6", + "use_cache": false, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..a216973 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,11 @@ +{ + "_from_model_config": true, + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128012 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..8edee9d --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:981388251a81ca960b0729a7b30d5d1b60a9d36f4799d862ccdb6aebce9a1389 +size 2471645608 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..2ad12c7 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..f342589 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7c979daf2c715603b21e094ce7e032280b007311a070cdf98ed708c492d614 +size 17209792 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..636c7ef --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128014": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128015": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128016": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128017": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128018": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|im_end|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|im_end|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..48f71fc --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 5.519394781646763e+19, + "train_loss": 3.57993624316737, + "train_runtime": 147576.5591, + "train_samples": 1444084, + "train_samples_per_second": 1.002, + "train_steps_per_second": 0.125 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..685ab61 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,166453 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 18490, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027041644131963225, + "grad_norm": 3.859375, + "learning_rate": 0.0, + "loss": 1.3756, + "mean_token_accuracy": 0.6463733911514282, + "num_tokens": 508614.0, + "step": 1 + }, + { + "epoch": 0.0005408328826392645, + "grad_norm": 4.59375, + "learning_rate": 3.603603603603604e-05, + "loss": 1.4468, + "mean_token_accuracy": 0.6429179906845093, + "num_tokens": 1000933.0, + "step": 2 + }, + { + "epoch": 0.0008112493239588967, + "grad_norm": 2.484375, + "learning_rate": 7.207207207207208e-05, + "loss": 1.2333, + "mean_token_accuracy": 0.6762370467185974, + "num_tokens": 1525119.0, + "step": 3 + }, + { + "epoch": 0.001081665765278529, + "grad_norm": 5.625, + "learning_rate": 0.00010810810810810812, + "loss": 1.3557, + "mean_token_accuracy": 0.671931803226471, + "num_tokens": 2040588.0, + "step": 4 + }, + { + "epoch": 0.0013520822065981612, + "grad_norm": 4.40625, + "learning_rate": 0.00014414414414414415, + "loss": 1.3718, + "mean_token_accuracy": 0.6454557776451111, + "num_tokens": 2518672.0, + "step": 5 + }, + { + "epoch": 0.0016224986479177934, + "grad_norm": 3.0625, + "learning_rate": 0.00018018018018018018, + "loss": 1.3676, + "mean_token_accuracy": 0.6537374258041382, + "num_tokens": 3042747.0, + "step": 6 + }, + { + "epoch": 0.0018929150892374256, + "grad_norm": 3.015625, + "learning_rate": 0.00021621621621621624, + "loss": 1.4042, + "mean_token_accuracy": 0.6471892595291138, + "num_tokens": 3566859.0, + "step": 7 + }, + { + "epoch": 0.002163331530557058, + "grad_norm": 2.703125, + "learning_rate": 0.00025225225225225225, + "loss": 1.3115, + "mean_token_accuracy": 0.6724572777748108, + "num_tokens": 4091138.0, + "step": 8 + }, + { + "epoch": 0.0024337479718766902, + "grad_norm": 2.84375, + "learning_rate": 0.0002882882882882883, + "loss": 1.3539, + "mean_token_accuracy": 0.655264675617218, + "num_tokens": 4615417.0, + "step": 9 + }, + { + "epoch": 0.0027041644131963224, + "grad_norm": 2.203125, + "learning_rate": 0.00032432432432432436, + "loss": 1.3195, + "mean_token_accuracy": 0.6568590402603149, + "num_tokens": 5139696.0, + "step": 10 + }, + { + "epoch": 0.0029745808545159546, + "grad_norm": 3.453125, + "learning_rate": 0.00036036036036036037, + "loss": 1.4144, + "mean_token_accuracy": 0.6557620763778687, + "num_tokens": 5601795.0, + "step": 11 + }, + { + "epoch": 0.003244997295835587, + "grad_norm": 55.5, + "learning_rate": 0.00039639639639639637, + "loss": 11.7015, + "mean_token_accuracy": 0.006492180284112692, + "num_tokens": 6070287.0, + "step": 12 + }, + { + "epoch": 0.003515413737155219, + "grad_norm": 61.75, + "learning_rate": 0.0004324324324324325, + "loss": 11.5978, + "mean_token_accuracy": 0.005790099501609802, + "num_tokens": 6594549.0, + "step": 13 + }, + { + "epoch": 0.0037858301784748512, + "grad_norm": 67.0, + "learning_rate": 0.0004684684684684685, + "loss": 11.3514, + "mean_token_accuracy": 0.005994436331093311, + "num_tokens": 7118826.0, + "step": 14 + }, + { + "epoch": 0.004056246619794483, + "grad_norm": 71.0, + "learning_rate": 0.0005045045045045045, + "loss": 11.2383, + "mean_token_accuracy": 0.006214096210896969, + "num_tokens": 7642988.0, + "step": 15 + }, + { + "epoch": 0.004326663061114116, + "grad_norm": 47.0, + "learning_rate": 0.0005405405405405405, + "loss": 11.5121, + "mean_token_accuracy": 0.009034290909767151, + "num_tokens": 8167205.0, + "step": 16 + }, + { + "epoch": 0.004597079502433748, + "grad_norm": 56.5, + "learning_rate": 0.0005765765765765766, + "loss": 11.1939, + "mean_token_accuracy": 0.008533889427781105, + "num_tokens": 8691425.0, + "step": 17 + }, + { + "epoch": 0.0048674959437533805, + "grad_norm": 54.0, + "learning_rate": 0.0006126126126126127, + "loss": 11.0753, + "mean_token_accuracy": 0.009354197420179844, + "num_tokens": 9215644.0, + "step": 18 + }, + { + "epoch": 0.005137912385073012, + "grad_norm": 52.25, + "learning_rate": 0.0006486486486486487, + "loss": 11.0392, + "mean_token_accuracy": 0.00892704539000988, + "num_tokens": 9727459.0, + "step": 19 + }, + { + "epoch": 0.005408328826392645, + "grad_norm": 48.0, + "learning_rate": 0.0006846846846846847, + "loss": 10.9387, + "mean_token_accuracy": 0.012390341609716415, + "num_tokens": 10251737.0, + "step": 20 + }, + { + "epoch": 0.005678745267712277, + "grad_norm": 46.5, + "learning_rate": 0.0007207207207207207, + "loss": 10.822, + "mean_token_accuracy": 0.010529263876378536, + "num_tokens": 10775964.0, + "step": 21 + }, + { + "epoch": 0.005949161709031909, + "grad_norm": 45.0, + "learning_rate": 0.0007567567567567568, + "loss": 10.7793, + "mean_token_accuracy": 0.012530284002423286, + "num_tokens": 11300165.0, + "step": 22 + }, + { + "epoch": 0.006219578150351541, + "grad_norm": 48.25, + "learning_rate": 0.0007927927927927927, + "loss": 10.4212, + "mean_token_accuracy": 0.011827011592686176, + "num_tokens": 11824394.0, + "step": 23 + }, + { + "epoch": 0.006489994591671174, + "grad_norm": 46.0, + "learning_rate": 0.0008288288288288288, + "loss": 10.3853, + "mean_token_accuracy": 0.012856585904955864, + "num_tokens": 12300420.0, + "step": 24 + }, + { + "epoch": 0.006760411032990805, + "grad_norm": 53.5, + "learning_rate": 0.000864864864864865, + "loss": 10.0173, + "mean_token_accuracy": 0.014139022678136826, + "num_tokens": 12820475.0, + "step": 25 + }, + { + "epoch": 0.007030827474310438, + "grad_norm": 48.75, + "learning_rate": 0.0009009009009009009, + "loss": 10.0244, + "mean_token_accuracy": 0.015511615201830864, + "num_tokens": 13344739.0, + "step": 26 + }, + { + "epoch": 0.007301243915630071, + "grad_norm": 46.0, + "learning_rate": 0.000936936936936937, + "loss": 10.1071, + "mean_token_accuracy": 0.016097208485007286, + "num_tokens": 13857369.0, + "step": 27 + }, + { + "epoch": 0.0075716603569497025, + "grad_norm": 43.0, + "learning_rate": 0.000972972972972973, + "loss": 9.892, + "mean_token_accuracy": 0.016042299568653107, + "num_tokens": 14381642.0, + "step": 28 + }, + { + "epoch": 0.007842076798269334, + "grad_norm": 43.5, + "learning_rate": 0.001009009009009009, + "loss": 9.8366, + "mean_token_accuracy": 0.016977433115243912, + "num_tokens": 14905801.0, + "step": 29 + }, + { + "epoch": 0.008112493239588967, + "grad_norm": 46.75, + "learning_rate": 0.001045045045045045, + "loss": 9.0205, + "mean_token_accuracy": 0.02161179669201374, + "num_tokens": 15351015.0, + "step": 30 + }, + { + "epoch": 0.0083829096809086, + "grad_norm": 12.1875, + "learning_rate": 0.001081081081081081, + "loss": 1.8703, + "mean_token_accuracy": 0.5682125091552734, + "num_tokens": 15839774.0, + "step": 31 + }, + { + "epoch": 0.008653326122228232, + "grad_norm": 39.0, + "learning_rate": 0.0011171171171171172, + "loss": 9.2512, + "mean_token_accuracy": 0.01997269317507744, + "num_tokens": 16364041.0, + "step": 32 + }, + { + "epoch": 0.008923742563547863, + "grad_norm": 32.25, + "learning_rate": 0.0011531531531531532, + "loss": 9.2244, + "mean_token_accuracy": 0.02161121368408203, + "num_tokens": 16888240.0, + "step": 33 + }, + { + "epoch": 0.009194159004867496, + "grad_norm": 30.0, + "learning_rate": 0.0011891891891891893, + "loss": 9.23, + "mean_token_accuracy": 0.020842980593442917, + "num_tokens": 17412330.0, + "step": 34 + }, + { + "epoch": 0.009464575446187128, + "grad_norm": 32.25, + "learning_rate": 0.0012252252252252253, + "loss": 8.6952, + "mean_token_accuracy": 0.02261316031217575, + "num_tokens": 17936498.0, + "step": 35 + }, + { + "epoch": 0.009734991887506761, + "grad_norm": 24.5, + "learning_rate": 0.0012612612612612612, + "loss": 8.9908, + "mean_token_accuracy": 0.022661831229925156, + "num_tokens": 18460615.0, + "step": 36 + }, + { + "epoch": 0.010005408328826392, + "grad_norm": 22.625, + "learning_rate": 0.0012972972972972974, + "loss": 8.6013, + "mean_token_accuracy": 0.023584000766277313, + "num_tokens": 18924415.0, + "step": 37 + }, + { + "epoch": 0.010275824770146024, + "grad_norm": 20.875, + "learning_rate": 0.0013333333333333333, + "loss": 8.9118, + "mean_token_accuracy": 0.026403255760669708, + "num_tokens": 19448668.0, + "step": 38 + }, + { + "epoch": 0.010546241211465657, + "grad_norm": 15.9375, + "learning_rate": 0.0013693693693693693, + "loss": 8.47, + "mean_token_accuracy": 0.026570983231067657, + "num_tokens": 19911783.0, + "step": 39 + }, + { + "epoch": 0.01081665765278529, + "grad_norm": 16.875, + "learning_rate": 0.0014054054054054054, + "loss": 8.2491, + "mean_token_accuracy": 0.025829242542386055, + "num_tokens": 20435913.0, + "step": 40 + }, + { + "epoch": 0.011087074094104922, + "grad_norm": 14.125, + "learning_rate": 0.0014414414414414415, + "loss": 7.9134, + "mean_token_accuracy": 0.030361132696270943, + "num_tokens": 20960135.0, + "step": 41 + }, + { + "epoch": 0.011357490535424553, + "grad_norm": 11.75, + "learning_rate": 0.0014774774774774773, + "loss": 8.0309, + "mean_token_accuracy": 0.028619473800063133, + "num_tokens": 21419709.0, + "step": 42 + }, + { + "epoch": 0.011627906976744186, + "grad_norm": 10.5, + "learning_rate": 0.0015135135135135136, + "loss": 8.3763, + "mean_token_accuracy": 0.031022315844893456, + "num_tokens": 21943872.0, + "step": 43 + }, + { + "epoch": 0.011898323418063819, + "grad_norm": 8.9375, + "learning_rate": 0.0015495495495495496, + "loss": 8.2549, + "mean_token_accuracy": 0.03206344693899155, + "num_tokens": 22468031.0, + "step": 44 + }, + { + "epoch": 0.012168739859383451, + "grad_norm": 8.875, + "learning_rate": 0.0015855855855855855, + "loss": 8.0586, + "mean_token_accuracy": 0.033094439655542374, + "num_tokens": 22992131.0, + "step": 45 + }, + { + "epoch": 0.012439156300703082, + "grad_norm": 8.875, + "learning_rate": 0.0016216216216216218, + "loss": 8.4557, + "mean_token_accuracy": 0.032964542508125305, + "num_tokens": 23461592.0, + "step": 46 + }, + { + "epoch": 0.012709572742022715, + "grad_norm": 8.5, + "learning_rate": 0.0016576576576576576, + "loss": 8.4045, + "mean_token_accuracy": 0.030144980177283287, + "num_tokens": 23975510.0, + "step": 47 + }, + { + "epoch": 0.012979989183342347, + "grad_norm": 8.0625, + "learning_rate": 0.0016936936936936937, + "loss": 7.7087, + "mean_token_accuracy": 0.037450823932886124, + "num_tokens": 24499651.0, + "step": 48 + }, + { + "epoch": 0.01325040562466198, + "grad_norm": 8.1875, + "learning_rate": 0.00172972972972973, + "loss": 7.8719, + "mean_token_accuracy": 0.03247333690524101, + "num_tokens": 24980867.0, + "step": 49 + }, + { + "epoch": 0.01352082206598161, + "grad_norm": 6.4375, + "learning_rate": 0.0017657657657657658, + "loss": 7.8261, + "mean_token_accuracy": 0.03638915345072746, + "num_tokens": 25483413.0, + "step": 50 + }, + { + "epoch": 0.013791238507301243, + "grad_norm": 29.125, + "learning_rate": 0.0018018018018018018, + "loss": 2.1047, + "mean_token_accuracy": 0.5213767290115356, + "num_tokens": 25996420.0, + "step": 51 + }, + { + "epoch": 0.014061654948620876, + "grad_norm": 5.65625, + "learning_rate": 0.0018378378378378379, + "loss": 7.5875, + "mean_token_accuracy": 0.043139055371284485, + "num_tokens": 26520681.0, + "step": 52 + }, + { + "epoch": 0.014332071389940509, + "grad_norm": 5.9375, + "learning_rate": 0.001873873873873874, + "loss": 8.07, + "mean_token_accuracy": 0.034927383065223694, + "num_tokens": 27044943.0, + "step": 53 + }, + { + "epoch": 0.014602487831260141, + "grad_norm": 5.96875, + "learning_rate": 0.0019099099099099098, + "loss": 7.7268, + "mean_token_accuracy": 0.03868185728788376, + "num_tokens": 27507212.0, + "step": 54 + }, + { + "epoch": 0.014872904272579772, + "grad_norm": 5.90625, + "learning_rate": 0.001945945945945946, + "loss": 7.4118, + "mean_token_accuracy": 0.04073121398687363, + "num_tokens": 28031393.0, + "step": 55 + }, + { + "epoch": 0.015143320713899405, + "grad_norm": 5.90625, + "learning_rate": 0.001981981981981982, + "loss": 7.782, + "mean_token_accuracy": 0.04255130887031555, + "num_tokens": 28504329.0, + "step": 56 + }, + { + "epoch": 0.015413737155219038, + "grad_norm": 4.8125, + "learning_rate": 0.002018018018018018, + "loss": 7.4961, + "mean_token_accuracy": 0.048983313143253326, + "num_tokens": 28996545.0, + "step": 57 + }, + { + "epoch": 0.01568415359653867, + "grad_norm": 4.78125, + "learning_rate": 0.0020540540540540542, + "loss": 7.5331, + "mean_token_accuracy": 0.0468372106552124, + "num_tokens": 29520677.0, + "step": 58 + }, + { + "epoch": 0.0159545700378583, + "grad_norm": 5.8125, + "learning_rate": 0.00209009009009009, + "loss": 7.1944, + "mean_token_accuracy": 0.04862525314092636, + "num_tokens": 30044764.0, + "step": 59 + }, + { + "epoch": 0.016224986479177934, + "grad_norm": 3.96875, + "learning_rate": 0.002126126126126126, + "loss": 7.4974, + "mean_token_accuracy": 0.057477470487356186, + "num_tokens": 30568995.0, + "step": 60 + }, + { + "epoch": 0.016495402920497566, + "grad_norm": 4.25, + "learning_rate": 0.002162162162162162, + "loss": 7.406, + "mean_token_accuracy": 0.052003391087055206, + "num_tokens": 31093146.0, + "step": 61 + }, + { + "epoch": 0.0167658193618172, + "grad_norm": 5.1875, + "learning_rate": 0.0021981981981981985, + "loss": 7.6605, + "mean_token_accuracy": 0.048209577798843384, + "num_tokens": 31617423.0, + "step": 62 + }, + { + "epoch": 0.01703623580313683, + "grad_norm": 6.75, + "learning_rate": 0.0022342342342342343, + "loss": 7.8237, + "mean_token_accuracy": 0.04746171832084656, + "num_tokens": 32092291.0, + "step": 63 + }, + { + "epoch": 0.017306652244456464, + "grad_norm": 5.4375, + "learning_rate": 0.0022702702702702706, + "loss": 7.0595, + "mean_token_accuracy": 0.058248914778232574, + "num_tokens": 32599776.0, + "step": 64 + }, + { + "epoch": 0.017577068685776097, + "grad_norm": 5.65625, + "learning_rate": 0.0023063063063063064, + "loss": 7.2216, + "mean_token_accuracy": 0.05871368572115898, + "num_tokens": 33100260.0, + "step": 65 + }, + { + "epoch": 0.017847485127095726, + "grad_norm": 5.40625, + "learning_rate": 0.0023423423423423423, + "loss": 7.4625, + "mean_token_accuracy": 0.060878366231918335, + "num_tokens": 33624511.0, + "step": 66 + }, + { + "epoch": 0.01811790156841536, + "grad_norm": 5.03125, + "learning_rate": 0.0023783783783783785, + "loss": 7.0026, + "mean_token_accuracy": 0.06015501916408539, + "num_tokens": 34148775.0, + "step": 67 + }, + { + "epoch": 0.01838831800973499, + "grad_norm": 4.8125, + "learning_rate": 0.0024144144144144144, + "loss": 7.248, + "mean_token_accuracy": 0.05763985961675644, + "num_tokens": 34673009.0, + "step": 68 + }, + { + "epoch": 0.018658734451054624, + "grad_norm": 11.5, + "learning_rate": 0.0024504504504504507, + "loss": 6.8716, + "mean_token_accuracy": 0.08775345236063004, + "num_tokens": 35139952.0, + "step": 69 + }, + { + "epoch": 0.018929150892374257, + "grad_norm": 7.8125, + "learning_rate": 0.0024864864864864865, + "loss": 6.9779, + "mean_token_accuracy": 0.068984255194664, + "num_tokens": 35664048.0, + "step": 70 + }, + { + "epoch": 0.01919956733369389, + "grad_norm": 47.75, + "learning_rate": 0.0025225225225225223, + "loss": 3.0369, + "mean_token_accuracy": 0.4360913932323456, + "num_tokens": 36188240.0, + "step": 71 + }, + { + "epoch": 0.019469983775013522, + "grad_norm": 6.53125, + "learning_rate": 0.0025585585585585586, + "loss": 6.6632, + "mean_token_accuracy": 0.06731884181499481, + "num_tokens": 36712521.0, + "step": 72 + }, + { + "epoch": 0.019740400216333154, + "grad_norm": 5.65625, + "learning_rate": 0.002594594594594595, + "loss": 7.0835, + "mean_token_accuracy": 0.0699826329946518, + "num_tokens": 37236741.0, + "step": 73 + }, + { + "epoch": 0.020010816657652784, + "grad_norm": 4.625, + "learning_rate": 0.0026306306306306303, + "loss": 6.6772, + "mean_token_accuracy": 0.07390651106834412, + "num_tokens": 37760978.0, + "step": 74 + }, + { + "epoch": 0.020281233098972416, + "grad_norm": 8.3125, + "learning_rate": 0.0026666666666666666, + "loss": 6.4784, + "mean_token_accuracy": 0.08860579133033752, + "num_tokens": 38285152.0, + "step": 75 + }, + { + "epoch": 0.02055164954029205, + "grad_norm": 6.4375, + "learning_rate": 0.002702702702702703, + "loss": 6.6504, + "mean_token_accuracy": 0.08122272789478302, + "num_tokens": 38796738.0, + "step": 76 + }, + { + "epoch": 0.02082206598161168, + "grad_norm": 4.34375, + "learning_rate": 0.0027387387387387387, + "loss": 6.948, + "mean_token_accuracy": 0.07741475850343704, + "num_tokens": 39320779.0, + "step": 77 + }, + { + "epoch": 0.021092482422931314, + "grad_norm": 4.15625, + "learning_rate": 0.002774774774774775, + "loss": 6.7969, + "mean_token_accuracy": 0.08272359520196915, + "num_tokens": 39845036.0, + "step": 78 + }, + { + "epoch": 0.021362898864250947, + "grad_norm": 3.84375, + "learning_rate": 0.002810810810810811, + "loss": 6.7633, + "mean_token_accuracy": 0.08368474245071411, + "num_tokens": 40369258.0, + "step": 79 + }, + { + "epoch": 0.02163331530557058, + "grad_norm": 9.25, + "learning_rate": 0.0028468468468468467, + "loss": 6.1429, + "mean_token_accuracy": 0.09644342958927155, + "num_tokens": 40846724.0, + "step": 80 + }, + { + "epoch": 0.021903731746890212, + "grad_norm": 5.5625, + "learning_rate": 0.002882882882882883, + "loss": 6.5381, + "mean_token_accuracy": 0.09120111167430878, + "num_tokens": 41370998.0, + "step": 81 + }, + { + "epoch": 0.022174148188209845, + "grad_norm": 5.15625, + "learning_rate": 0.002918918918918919, + "loss": 6.5302, + "mean_token_accuracy": 0.09361293911933899, + "num_tokens": 41865680.0, + "step": 82 + }, + { + "epoch": 0.022444564629529474, + "grad_norm": 5.875, + "learning_rate": 0.0029549549549549546, + "loss": 5.9492, + "mean_token_accuracy": 0.09277036786079407, + "num_tokens": 42389849.0, + "step": 83 + }, + { + "epoch": 0.022714981070849107, + "grad_norm": 7.125, + "learning_rate": 0.002990990990990991, + "loss": 6.1477, + "mean_token_accuracy": 0.10488244146108627, + "num_tokens": 42914056.0, + "step": 84 + }, + { + "epoch": 0.02298539751216874, + "grad_norm": 9.5, + "learning_rate": 0.003027027027027027, + "loss": 5.5629, + "mean_token_accuracy": 0.10988132655620575, + "num_tokens": 43438299.0, + "step": 85 + }, + { + "epoch": 0.023255813953488372, + "grad_norm": 8.4375, + "learning_rate": 0.003063063063063063, + "loss": 6.078, + "mean_token_accuracy": 0.1119377613067627, + "num_tokens": 43950415.0, + "step": 86 + }, + { + "epoch": 0.023526230394808004, + "grad_norm": 4.5625, + "learning_rate": 0.0030990990990990993, + "loss": 6.2663, + "mean_token_accuracy": 0.11282890290021896, + "num_tokens": 44474597.0, + "step": 87 + }, + { + "epoch": 0.023796646836127637, + "grad_norm": 5.875, + "learning_rate": 0.0031351351351351356, + "loss": 6.1067, + "mean_token_accuracy": 0.11476974189281464, + "num_tokens": 44998811.0, + "step": 88 + }, + { + "epoch": 0.02406706327744727, + "grad_norm": 9.5, + "learning_rate": 0.003171171171171171, + "loss": 5.9706, + "mean_token_accuracy": 0.12156268954277039, + "num_tokens": 45514485.0, + "step": 89 + }, + { + "epoch": 0.024337479718766902, + "grad_norm": 11.125, + "learning_rate": 0.0032072072072072072, + "loss": 5.5913, + "mean_token_accuracy": 0.1218656525015831, + "num_tokens": 46038748.0, + "step": 90 + }, + { + "epoch": 0.024607896160086535, + "grad_norm": 27.125, + "learning_rate": 0.0032432432432432435, + "loss": 3.052, + "mean_token_accuracy": 0.38376349210739136, + "num_tokens": 46562941.0, + "step": 91 + }, + { + "epoch": 0.024878312601406164, + "grad_norm": 8.1875, + "learning_rate": 0.0032792792792792794, + "loss": 6.2614, + "mean_token_accuracy": 0.11843432486057281, + "num_tokens": 47051029.0, + "step": 92 + }, + { + "epoch": 0.025148729042725797, + "grad_norm": 6.9375, + "learning_rate": 0.003315315315315315, + "loss": 5.8939, + "mean_token_accuracy": 0.12761126458644867, + "num_tokens": 47527666.0, + "step": 93 + }, + { + "epoch": 0.02541914548404543, + "grad_norm": 5.53125, + "learning_rate": 0.0033513513513513515, + "loss": 5.9153, + "mean_token_accuracy": 0.13580116629600525, + "num_tokens": 48051797.0, + "step": 94 + }, + { + "epoch": 0.025689561925365062, + "grad_norm": 7.5625, + "learning_rate": 0.0033873873873873873, + "loss": 5.7838, + "mean_token_accuracy": 0.13662835955619812, + "num_tokens": 48576075.0, + "step": 95 + }, + { + "epoch": 0.025959978366684695, + "grad_norm": 6.9375, + "learning_rate": 0.0034234234234234236, + "loss": 5.8747, + "mean_token_accuracy": 0.13031980395317078, + "num_tokens": 49100324.0, + "step": 96 + }, + { + "epoch": 0.026230394808004327, + "grad_norm": 5.4375, + "learning_rate": 0.00345945945945946, + "loss": 5.7469, + "mean_token_accuracy": 0.14046166837215424, + "num_tokens": 49624600.0, + "step": 97 + }, + { + "epoch": 0.02650081124932396, + "grad_norm": 6.4375, + "learning_rate": 0.0034954954954954953, + "loss": 5.7985, + "mean_token_accuracy": 0.13971208035945892, + "num_tokens": 50148780.0, + "step": 98 + }, + { + "epoch": 0.026771227690643593, + "grad_norm": 6.78125, + "learning_rate": 0.0035315315315315315, + "loss": 5.2869, + "mean_token_accuracy": 0.15723374485969543, + "num_tokens": 50614598.0, + "step": 99 + }, + { + "epoch": 0.02704164413196322, + "grad_norm": 4.71875, + "learning_rate": 0.003567567567567568, + "loss": 5.7094, + "mean_token_accuracy": 0.14368551969528198, + "num_tokens": 51138863.0, + "step": 100 + }, + { + "epoch": 0.027312060573282854, + "grad_norm": 5.59375, + "learning_rate": 0.0036036036036036037, + "loss": 5.6781, + "mean_token_accuracy": 0.15027478337287903, + "num_tokens": 51663135.0, + "step": 101 + }, + { + "epoch": 0.027582477014602487, + "grad_norm": 6.34375, + "learning_rate": 0.00363963963963964, + "loss": 5.6648, + "mean_token_accuracy": 0.14982259273529053, + "num_tokens": 52187304.0, + "step": 102 + }, + { + "epoch": 0.02785289345592212, + "grad_norm": 7.875, + "learning_rate": 0.0036756756756756758, + "loss": 5.3723, + "mean_token_accuracy": 0.1609998494386673, + "num_tokens": 52711492.0, + "step": 103 + }, + { + "epoch": 0.028123309897241752, + "grad_norm": 6.65625, + "learning_rate": 0.0037117117117117116, + "loss": 5.4166, + "mean_token_accuracy": 0.1791379600763321, + "num_tokens": 53235740.0, + "step": 104 + }, + { + "epoch": 0.028393726338561385, + "grad_norm": 10.5625, + "learning_rate": 0.003747747747747748, + "loss": 5.2074, + "mean_token_accuracy": 0.16634708642959595, + "num_tokens": 53760018.0, + "step": 105 + }, + { + "epoch": 0.028664142779881017, + "grad_norm": 24.25, + "learning_rate": 0.003783783783783784, + "loss": 5.3185, + "mean_token_accuracy": 0.20518982410430908, + "num_tokens": 54198401.0, + "step": 106 + }, + { + "epoch": 0.02893455922120065, + "grad_norm": 10.375, + "learning_rate": 0.0038198198198198196, + "loss": 4.9665, + "mean_token_accuracy": 0.17331624031066895, + "num_tokens": 54722669.0, + "step": 107 + }, + { + "epoch": 0.029204975662520283, + "grad_norm": 8.875, + "learning_rate": 0.003855855855855856, + "loss": 5.3855, + "mean_token_accuracy": 0.1573399305343628, + "num_tokens": 55246946.0, + "step": 108 + }, + { + "epoch": 0.029475392103839912, + "grad_norm": 8.25, + "learning_rate": 0.003891891891891892, + "loss": 4.9879, + "mean_token_accuracy": 0.16905425488948822, + "num_tokens": 55771163.0, + "step": 109 + }, + { + "epoch": 0.029745808545159545, + "grad_norm": 5.5, + "learning_rate": 0.0039279279279279275, + "loss": 5.3442, + "mean_token_accuracy": 0.18036916851997375, + "num_tokens": 56267267.0, + "step": 110 + }, + { + "epoch": 0.030016224986479177, + "grad_norm": 66.5, + "learning_rate": 0.003963963963963964, + "loss": 6.8841, + "mean_token_accuracy": 0.2020874172449112, + "num_tokens": 56791546.0, + "step": 111 + }, + { + "epoch": 0.03028664142779881, + "grad_norm": 11.0, + "learning_rate": 0.004, + "loss": 5.41, + "mean_token_accuracy": 0.15774443745613098, + "num_tokens": 57315828.0, + "step": 112 + }, + { + "epoch": 0.030557057869118442, + "grad_norm": 5.28125, + "learning_rate": 0.004036036036036036, + "loss": 5.359, + "mean_token_accuracy": 0.17555084824562073, + "num_tokens": 57749330.0, + "step": 113 + }, + { + "epoch": 0.030827474310438075, + "grad_norm": 6.625, + "learning_rate": 0.004072072072072072, + "loss": 5.156, + "mean_token_accuracy": 0.1915835738182068, + "num_tokens": 58273587.0, + "step": 114 + }, + { + "epoch": 0.031097890751757708, + "grad_norm": 10.0, + "learning_rate": 0.0041081081081081085, + "loss": 5.1424, + "mean_token_accuracy": 0.197003573179245, + "num_tokens": 58749554.0, + "step": 115 + }, + { + "epoch": 0.03136830719307734, + "grad_norm": 10.1875, + "learning_rate": 0.004144144144144144, + "loss": 4.9125, + "mean_token_accuracy": 0.1880778670310974, + "num_tokens": 59273753.0, + "step": 116 + }, + { + "epoch": 0.03163872363439697, + "grad_norm": 13.875, + "learning_rate": 0.00418018018018018, + "loss": 4.7772, + "mean_token_accuracy": 0.20651176571846008, + "num_tokens": 59714097.0, + "step": 117 + }, + { + "epoch": 0.0319091400757166, + "grad_norm": 7.875, + "learning_rate": 0.004216216216216217, + "loss": 5.0562, + "mean_token_accuracy": 0.18882489204406738, + "num_tokens": 60238357.0, + "step": 118 + }, + { + "epoch": 0.03217955651703624, + "grad_norm": 12.6875, + "learning_rate": 0.004252252252252252, + "loss": 4.954, + "mean_token_accuracy": 0.19084346294403076, + "num_tokens": 60762628.0, + "step": 119 + }, + { + "epoch": 0.03244997295835587, + "grad_norm": 12.4375, + "learning_rate": 0.0042882882882882885, + "loss": 4.4969, + "mean_token_accuracy": 0.17497316002845764, + "num_tokens": 61286559.0, + "step": 120 + }, + { + "epoch": 0.032720389399675504, + "grad_norm": 8.5625, + "learning_rate": 0.004324324324324324, + "loss": 4.7009, + "mean_token_accuracy": 0.2128652185201645, + "num_tokens": 61807315.0, + "step": 121 + }, + { + "epoch": 0.03299080584099513, + "grad_norm": 4.78125, + "learning_rate": 0.00436036036036036, + "loss": 4.8606, + "mean_token_accuracy": 0.20402294397354126, + "num_tokens": 62331480.0, + "step": 122 + }, + { + "epoch": 0.03326122228231476, + "grad_norm": 6.03125, + "learning_rate": 0.004396396396396397, + "loss": 4.8349, + "mean_token_accuracy": 0.20275267958641052, + "num_tokens": 62855742.0, + "step": 123 + }, + { + "epoch": 0.0335316387236344, + "grad_norm": 6.75, + "learning_rate": 0.004432432432432433, + "loss": 4.5134, + "mean_token_accuracy": 0.2195475697517395, + "num_tokens": 63380025.0, + "step": 124 + }, + { + "epoch": 0.03380205516495403, + "grad_norm": 6.84375, + "learning_rate": 0.004468468468468469, + "loss": 4.8437, + "mean_token_accuracy": 0.2138422727584839, + "num_tokens": 63864011.0, + "step": 125 + }, + { + "epoch": 0.03407247160627366, + "grad_norm": 7.46875, + "learning_rate": 0.0045045045045045045, + "loss": 4.5747, + "mean_token_accuracy": 0.21149061620235443, + "num_tokens": 64384918.0, + "step": 126 + }, + { + "epoch": 0.03434288804759329, + "grad_norm": 5.4375, + "learning_rate": 0.004540540540540541, + "loss": 4.8425, + "mean_token_accuracy": 0.21248796582221985, + "num_tokens": 64873812.0, + "step": 127 + }, + { + "epoch": 0.03461330448891293, + "grad_norm": 6.78125, + "learning_rate": 0.004576576576576576, + "loss": 4.8197, + "mean_token_accuracy": 0.20597991347312927, + "num_tokens": 65398077.0, + "step": 128 + }, + { + "epoch": 0.03488372093023256, + "grad_norm": 6.375, + "learning_rate": 0.004612612612612613, + "loss": 4.7996, + "mean_token_accuracy": 0.19869592785835266, + "num_tokens": 65922300.0, + "step": 129 + }, + { + "epoch": 0.035154137371552194, + "grad_norm": 7.0625, + "learning_rate": 0.004648648648648649, + "loss": 4.7352, + "mean_token_accuracy": 0.2333640456199646, + "num_tokens": 66393234.0, + "step": 130 + }, + { + "epoch": 0.03542455381287182, + "grad_norm": 55.5, + "learning_rate": 0.0046846846846846845, + "loss": 7.2685, + "mean_token_accuracy": 0.06730349361896515, + "num_tokens": 66854580.0, + "step": 131 + }, + { + "epoch": 0.03569497025419145, + "grad_norm": 15.625, + "learning_rate": 0.004720720720720721, + "loss": 4.6356, + "mean_token_accuracy": 0.21241170167922974, + "num_tokens": 67358258.0, + "step": 132 + }, + { + "epoch": 0.03596538669551109, + "grad_norm": 21.5, + "learning_rate": 0.004756756756756757, + "loss": 4.5796, + "mean_token_accuracy": 0.16847950220108032, + "num_tokens": 67882486.0, + "step": 133 + }, + { + "epoch": 0.03623580313683072, + "grad_norm": 5.96875, + "learning_rate": 0.004792792792792793, + "loss": 4.7103, + "mean_token_accuracy": 0.21743236482143402, + "num_tokens": 68406650.0, + "step": 134 + }, + { + "epoch": 0.03650621957815035, + "grad_norm": 5.96875, + "learning_rate": 0.004828828828828829, + "loss": 4.512, + "mean_token_accuracy": 0.2317867875099182, + "num_tokens": 68930907.0, + "step": 135 + }, + { + "epoch": 0.03677663601946998, + "grad_norm": 6.4375, + "learning_rate": 0.0048648648648648655, + "loss": 4.5286, + "mean_token_accuracy": 0.2248746156692505, + "num_tokens": 69455139.0, + "step": 136 + }, + { + "epoch": 0.03704705246078962, + "grad_norm": 6.1875, + "learning_rate": 0.004900900900900901, + "loss": 4.5125, + "mean_token_accuracy": 0.23178859055042267, + "num_tokens": 69979379.0, + "step": 137 + }, + { + "epoch": 0.03731746890210925, + "grad_norm": 7.375, + "learning_rate": 0.004936936936936937, + "loss": 4.4465, + "mean_token_accuracy": 0.21927237510681152, + "num_tokens": 70503618.0, + "step": 138 + }, + { + "epoch": 0.03758788534342888, + "grad_norm": 7.59375, + "learning_rate": 0.004972972972972973, + "loss": 4.3016, + "mean_token_accuracy": 0.23671278357505798, + "num_tokens": 71027714.0, + "step": 139 + }, + { + "epoch": 0.03785830178474851, + "grad_norm": 8.5, + "learning_rate": 0.005009009009009009, + "loss": 4.6556, + "mean_token_accuracy": 0.2249819040298462, + "num_tokens": 71551965.0, + "step": 140 + }, + { + "epoch": 0.03812871822606814, + "grad_norm": 8.4375, + "learning_rate": 0.005045045045045045, + "loss": 4.4745, + "mean_token_accuracy": 0.2355038821697235, + "num_tokens": 72076105.0, + "step": 141 + }, + { + "epoch": 0.03839913466738778, + "grad_norm": 6.3125, + "learning_rate": 0.005081081081081081, + "loss": 4.4701, + "mean_token_accuracy": 0.24281296133995056, + "num_tokens": 72600282.0, + "step": 142 + }, + { + "epoch": 0.03866955110870741, + "grad_norm": 7.03125, + "learning_rate": 0.005117117117117117, + "loss": 4.3564, + "mean_token_accuracy": 0.2480190098285675, + "num_tokens": 73093014.0, + "step": 143 + }, + { + "epoch": 0.038939967550027044, + "grad_norm": 6.875, + "learning_rate": 0.005153153153153153, + "loss": 4.25, + "mean_token_accuracy": 0.24712172150611877, + "num_tokens": 73617196.0, + "step": 144 + }, + { + "epoch": 0.03921038399134667, + "grad_norm": 7.65625, + "learning_rate": 0.00518918918918919, + "loss": 4.415, + "mean_token_accuracy": 0.23185965418815613, + "num_tokens": 74141271.0, + "step": 145 + }, + { + "epoch": 0.03948080043266631, + "grad_norm": 6.75, + "learning_rate": 0.005225225225225226, + "loss": 4.2763, + "mean_token_accuracy": 0.25648486614227295, + "num_tokens": 74665363.0, + "step": 146 + }, + { + "epoch": 0.03975121687398594, + "grad_norm": 8.5, + "learning_rate": 0.005261261261261261, + "loss": 4.233, + "mean_token_accuracy": 0.24098899960517883, + "num_tokens": 75189459.0, + "step": 147 + }, + { + "epoch": 0.04002163331530557, + "grad_norm": 6.0, + "learning_rate": 0.005297297297297297, + "loss": 4.2988, + "mean_token_accuracy": 0.2175518423318863, + "num_tokens": 75713670.0, + "step": 148 + }, + { + "epoch": 0.0402920497566252, + "grad_norm": 7.0625, + "learning_rate": 0.005333333333333333, + "loss": 3.9065, + "mean_token_accuracy": 0.25760161876678467, + "num_tokens": 76237893.0, + "step": 149 + }, + { + "epoch": 0.04056246619794483, + "grad_norm": 6.125, + "learning_rate": 0.005369369369369369, + "loss": 4.1138, + "mean_token_accuracy": 0.24665942788124084, + "num_tokens": 76728232.0, + "step": 150 + }, + { + "epoch": 0.04083288263926447, + "grad_norm": 104.0, + "learning_rate": 0.005405405405405406, + "loss": 12.4025, + "mean_token_accuracy": 0.010602214373648167, + "num_tokens": 77188771.0, + "step": 151 + }, + { + "epoch": 0.0411032990805841, + "grad_norm": 13.125, + "learning_rate": 0.0054414414414414415, + "loss": 4.5004, + "mean_token_accuracy": 0.2346687614917755, + "num_tokens": 77713017.0, + "step": 152 + }, + { + "epoch": 0.041373715521903734, + "grad_norm": 3.515625, + "learning_rate": 0.005477477477477477, + "loss": 4.3636, + "mean_token_accuracy": 0.25512564182281494, + "num_tokens": 78237297.0, + "step": 153 + }, + { + "epoch": 0.04164413196322336, + "grad_norm": 6.46875, + "learning_rate": 0.005513513513513514, + "loss": 4.1042, + "mean_token_accuracy": 0.2504834532737732, + "num_tokens": 78761524.0, + "step": 154 + }, + { + "epoch": 0.041914548404543, + "grad_norm": 9.5, + "learning_rate": 0.00554954954954955, + "loss": 4.2679, + "mean_token_accuracy": 0.25243449211120605, + "num_tokens": 79285793.0, + "step": 155 + }, + { + "epoch": 0.04218496484586263, + "grad_norm": 5.625, + "learning_rate": 0.005585585585585585, + "loss": 4.1692, + "mean_token_accuracy": 0.25159764289855957, + "num_tokens": 79810071.0, + "step": 156 + }, + { + "epoch": 0.04245538128718226, + "grad_norm": 7.96875, + "learning_rate": 0.005621621621621622, + "loss": 4.4202, + "mean_token_accuracy": 0.23541758954524994, + "num_tokens": 80334336.0, + "step": 157 + }, + { + "epoch": 0.042725797728501894, + "grad_norm": 5.0625, + "learning_rate": 0.0056576576576576575, + "loss": 3.9099, + "mean_token_accuracy": 0.2755391299724579, + "num_tokens": 80858424.0, + "step": 158 + }, + { + "epoch": 0.04299621416982152, + "grad_norm": 5.625, + "learning_rate": 0.005693693693693693, + "loss": 4.1559, + "mean_token_accuracy": 0.25689613819122314, + "num_tokens": 81382655.0, + "step": 159 + }, + { + "epoch": 0.04326663061114116, + "grad_norm": 6.46875, + "learning_rate": 0.00572972972972973, + "loss": 4.0816, + "mean_token_accuracy": 0.2521461248397827, + "num_tokens": 81906932.0, + "step": 160 + }, + { + "epoch": 0.04353704705246079, + "grad_norm": 5.21875, + "learning_rate": 0.005765765765765766, + "loss": 4.2787, + "mean_token_accuracy": 0.2687266767024994, + "num_tokens": 82403697.0, + "step": 161 + }, + { + "epoch": 0.043807463493780424, + "grad_norm": 5.6875, + "learning_rate": 0.005801801801801802, + "loss": 3.8571, + "mean_token_accuracy": 0.28497982025146484, + "num_tokens": 82876620.0, + "step": 162 + }, + { + "epoch": 0.04407787993510005, + "grad_norm": 7.3125, + "learning_rate": 0.005837837837837838, + "loss": 3.499, + "mean_token_accuracy": 0.29388970136642456, + "num_tokens": 83400690.0, + "step": 163 + }, + { + "epoch": 0.04434829637641969, + "grad_norm": 6.75, + "learning_rate": 0.005873873873873874, + "loss": 3.8452, + "mean_token_accuracy": 0.28647902607917786, + "num_tokens": 83924837.0, + "step": 164 + }, + { + "epoch": 0.04461871281773932, + "grad_norm": 5.6875, + "learning_rate": 0.005909909909909909, + "loss": 4.0439, + "mean_token_accuracy": 0.2679687738418579, + "num_tokens": 84425549.0, + "step": 165 + }, + { + "epoch": 0.04488912925905895, + "grad_norm": 7.6875, + "learning_rate": 0.005945945945945947, + "loss": 3.8735, + "mean_token_accuracy": 0.2906894087791443, + "num_tokens": 84915537.0, + "step": 166 + }, + { + "epoch": 0.045159545700378584, + "grad_norm": 6.90625, + "learning_rate": 0.005981981981981982, + "loss": 4.0159, + "mean_token_accuracy": 0.27447980642318726, + "num_tokens": 85439744.0, + "step": 167 + }, + { + "epoch": 0.04542996214169821, + "grad_norm": 6.125, + "learning_rate": 0.006018018018018018, + "loss": 3.8525, + "mean_token_accuracy": 0.29279109835624695, + "num_tokens": 85963725.0, + "step": 168 + }, + { + "epoch": 0.04570037858301785, + "grad_norm": 6.65625, + "learning_rate": 0.006054054054054054, + "loss": 4.118, + "mean_token_accuracy": 0.279152512550354, + "num_tokens": 86487758.0, + "step": 169 + }, + { + "epoch": 0.04597079502433748, + "grad_norm": 6.75, + "learning_rate": 0.00609009009009009, + "loss": 3.6383, + "mean_token_accuracy": 0.3120761513710022, + "num_tokens": 86953986.0, + "step": 170 + }, + { + "epoch": 0.046241211465657114, + "grad_norm": 34.25, + "learning_rate": 0.006126126126126126, + "loss": 9.5278, + "mean_token_accuracy": 0.03715433552861214, + "num_tokens": 87478231.0, + "step": 171 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 15.8125, + "learning_rate": 0.006162162162162163, + "loss": 4.3238, + "mean_token_accuracy": 0.27216610312461853, + "num_tokens": 87968693.0, + "step": 172 + }, + { + "epoch": 0.04678204434829638, + "grad_norm": 3.875, + "learning_rate": 0.0061981981981981986, + "loss": 4.0593, + "mean_token_accuracy": 0.2851804494857788, + "num_tokens": 88482764.0, + "step": 173 + }, + { + "epoch": 0.04705246078961601, + "grad_norm": 7.09375, + "learning_rate": 0.006234234234234234, + "loss": 4.1201, + "mean_token_accuracy": 0.2874716520309448, + "num_tokens": 89007037.0, + "step": 174 + }, + { + "epoch": 0.04732287723093564, + "grad_norm": 7.5625, + "learning_rate": 0.006270270270270271, + "loss": 3.9963, + "mean_token_accuracy": 0.27181103825569153, + "num_tokens": 89531232.0, + "step": 175 + }, + { + "epoch": 0.047593293672255274, + "grad_norm": 8.1875, + "learning_rate": 0.006306306306306306, + "loss": 3.8495, + "mean_token_accuracy": 0.2954772710800171, + "num_tokens": 90055378.0, + "step": 176 + }, + { + "epoch": 0.0478637101135749, + "grad_norm": 7.3125, + "learning_rate": 0.006342342342342342, + "loss": 3.9797, + "mean_token_accuracy": 0.28511685132980347, + "num_tokens": 90574969.0, + "step": 177 + }, + { + "epoch": 0.04813412655489454, + "grad_norm": 7.78125, + "learning_rate": 0.006378378378378379, + "loss": 3.8768, + "mean_token_accuracy": 0.2710055112838745, + "num_tokens": 91099222.0, + "step": 178 + }, + { + "epoch": 0.04840454299621417, + "grad_norm": 6.84375, + "learning_rate": 0.0064144144144144145, + "loss": 3.7503, + "mean_token_accuracy": 0.31182247400283813, + "num_tokens": 91623494.0, + "step": 179 + }, + { + "epoch": 0.048674959437533805, + "grad_norm": 5.71875, + "learning_rate": 0.00645045045045045, + "loss": 3.5801, + "mean_token_accuracy": 0.3196055293083191, + "num_tokens": 92147679.0, + "step": 180 + }, + { + "epoch": 0.048945375878853434, + "grad_norm": 6.3125, + "learning_rate": 0.006486486486486487, + "loss": 3.6606, + "mean_token_accuracy": 0.31389254331588745, + "num_tokens": 92671914.0, + "step": 181 + }, + { + "epoch": 0.04921579232017307, + "grad_norm": 6.0, + "learning_rate": 0.006522522522522523, + "loss": 3.7582, + "mean_token_accuracy": 0.2832849621772766, + "num_tokens": 93196088.0, + "step": 182 + }, + { + "epoch": 0.0494862087614927, + "grad_norm": 6.875, + "learning_rate": 0.006558558558558559, + "loss": 3.7451, + "mean_token_accuracy": 0.3210011124610901, + "num_tokens": 93713460.0, + "step": 183 + }, + { + "epoch": 0.04975662520281233, + "grad_norm": 5.71875, + "learning_rate": 0.006594594594594595, + "loss": 3.4997, + "mean_token_accuracy": 0.33678048849105835, + "num_tokens": 94237643.0, + "step": 184 + }, + { + "epoch": 0.050027041644131964, + "grad_norm": 6.3125, + "learning_rate": 0.00663063063063063, + "loss": 3.8853, + "mean_token_accuracy": 0.29498839378356934, + "num_tokens": 94747241.0, + "step": 185 + }, + { + "epoch": 0.05029745808545159, + "grad_norm": 5.625, + "learning_rate": 0.006666666666666666, + "loss": 3.3721, + "mean_token_accuracy": 0.3470045328140259, + "num_tokens": 95226907.0, + "step": 186 + }, + { + "epoch": 0.05056787452677123, + "grad_norm": 5.125, + "learning_rate": 0.006702702702702703, + "loss": 3.5267, + "mean_token_accuracy": 0.3408762216567993, + "num_tokens": 95751078.0, + "step": 187 + }, + { + "epoch": 0.05083829096809086, + "grad_norm": 7.90625, + "learning_rate": 0.006738738738738739, + "loss": 3.768, + "mean_token_accuracy": 0.31660470366477966, + "num_tokens": 96275169.0, + "step": 188 + }, + { + "epoch": 0.051108707409410495, + "grad_norm": 7.125, + "learning_rate": 0.006774774774774775, + "loss": 3.708, + "mean_token_accuracy": 0.3220239281654358, + "num_tokens": 96734378.0, + "step": 189 + }, + { + "epoch": 0.051379123850730124, + "grad_norm": 8.8125, + "learning_rate": 0.006810810810810811, + "loss": 3.6656, + "mean_token_accuracy": 0.33019766211509705, + "num_tokens": 97228434.0, + "step": 190 + }, + { + "epoch": 0.05164954029204975, + "grad_norm": 117.0, + "learning_rate": 0.006846846846846847, + "loss": 10.6254, + "mean_token_accuracy": 0.012333648279309273, + "num_tokens": 97699149.0, + "step": 191 + }, + { + "epoch": 0.05191995673336939, + "grad_norm": 15.25, + "learning_rate": 0.006882882882882883, + "loss": 3.957, + "mean_token_accuracy": 0.31971657276153564, + "num_tokens": 98196837.0, + "step": 192 + }, + { + "epoch": 0.05219037317468902, + "grad_norm": 4.5625, + "learning_rate": 0.00691891891891892, + "loss": 3.5873, + "mean_token_accuracy": 0.32943639159202576, + "num_tokens": 98720929.0, + "step": 193 + }, + { + "epoch": 0.052460789616008655, + "grad_norm": 6.875, + "learning_rate": 0.006954954954954955, + "loss": 3.5174, + "mean_token_accuracy": 0.3370971083641052, + "num_tokens": 99245210.0, + "step": 194 + }, + { + "epoch": 0.052731206057328284, + "grad_norm": 5.3125, + "learning_rate": 0.0069909909909909905, + "loss": 3.6187, + "mean_token_accuracy": 0.35257065296173096, + "num_tokens": 99723807.0, + "step": 195 + }, + { + "epoch": 0.05300162249864792, + "grad_norm": 7.5, + "learning_rate": 0.007027027027027027, + "loss": 3.584, + "mean_token_accuracy": 0.3473411202430725, + "num_tokens": 100140165.0, + "step": 196 + }, + { + "epoch": 0.05327203893996755, + "grad_norm": 6.0625, + "learning_rate": 0.007063063063063063, + "loss": 3.2935, + "mean_token_accuracy": 0.3800722360610962, + "num_tokens": 100605672.0, + "step": 197 + }, + { + "epoch": 0.053542455381287185, + "grad_norm": 8.1875, + "learning_rate": 0.007099099099099099, + "loss": 3.2348, + "mean_token_accuracy": 0.35315456986427307, + "num_tokens": 101129873.0, + "step": 198 + }, + { + "epoch": 0.053812871822606814, + "grad_norm": 6.5625, + "learning_rate": 0.007135135135135136, + "loss": 3.5233, + "mean_token_accuracy": 0.36206647753715515, + "num_tokens": 101600313.0, + "step": 199 + }, + { + "epoch": 0.05408328826392644, + "grad_norm": 6.9375, + "learning_rate": 0.0071711711711711715, + "loss": 3.5043, + "mean_token_accuracy": 0.3445475101470947, + "num_tokens": 102120921.0, + "step": 200 + }, + { + "epoch": 0.05435370470524608, + "grad_norm": 5.21875, + "learning_rate": 0.007207207207207207, + "loss": 3.5156, + "mean_token_accuracy": 0.35524874925613403, + "num_tokens": 102645161.0, + "step": 201 + }, + { + "epoch": 0.05462412114656571, + "grad_norm": 24.25, + "learning_rate": 0.007243243243243244, + "loss": 3.6179, + "mean_token_accuracy": 0.3584640622138977, + "num_tokens": 103169368.0, + "step": 202 + }, + { + "epoch": 0.054894537587885345, + "grad_norm": 5.59375, + "learning_rate": 0.00727927927927928, + "loss": 3.4541, + "mean_token_accuracy": 0.34446215629577637, + "num_tokens": 103693576.0, + "step": 203 + }, + { + "epoch": 0.055164954029204974, + "grad_norm": 5.0, + "learning_rate": 0.007315315315315315, + "loss": 3.4609, + "mean_token_accuracy": 0.36704587936401367, + "num_tokens": 104159468.0, + "step": 204 + }, + { + "epoch": 0.05543537047052461, + "grad_norm": 6.375, + "learning_rate": 0.0073513513513513516, + "loss": 3.4112, + "mean_token_accuracy": 0.35185784101486206, + "num_tokens": 104683725.0, + "step": 205 + }, + { + "epoch": 0.05570578691184424, + "grad_norm": 6.3125, + "learning_rate": 0.007387387387387387, + "loss": 3.3072, + "mean_token_accuracy": 0.365745484828949, + "num_tokens": 105155783.0, + "step": 206 + }, + { + "epoch": 0.055976203353163875, + "grad_norm": 6.09375, + "learning_rate": 0.007423423423423423, + "loss": 3.5775, + "mean_token_accuracy": 0.34331852197647095, + "num_tokens": 105680064.0, + "step": 207 + }, + { + "epoch": 0.056246619794483504, + "grad_norm": 7.53125, + "learning_rate": 0.00745945945945946, + "loss": 3.43, + "mean_token_accuracy": 0.338797926902771, + "num_tokens": 106204338.0, + "step": 208 + }, + { + "epoch": 0.056517036235803134, + "grad_norm": 8.4375, + "learning_rate": 0.007495495495495496, + "loss": 3.5629, + "mean_token_accuracy": 0.3531665802001953, + "num_tokens": 106728602.0, + "step": 209 + }, + { + "epoch": 0.05678745267712277, + "grad_norm": 7.0625, + "learning_rate": 0.007531531531531532, + "loss": 3.4865, + "mean_token_accuracy": 0.3734409213066101, + "num_tokens": 107221224.0, + "step": 210 + }, + { + "epoch": 0.0570578691184424, + "grad_norm": 245.0, + "learning_rate": 0.007567567567567568, + "loss": 15.6993, + "mean_token_accuracy": 0.001097214175388217, + "num_tokens": 107745390.0, + "step": 211 + }, + { + "epoch": 0.057328285559762035, + "grad_norm": 24.625, + "learning_rate": 0.007603603603603604, + "loss": 3.8645, + "mean_token_accuracy": 0.3082295060157776, + "num_tokens": 108269508.0, + "step": 212 + }, + { + "epoch": 0.057598702001081664, + "grad_norm": 5.5, + "learning_rate": 0.007639639639639639, + "loss": 3.4962, + "mean_token_accuracy": 0.35495176911354065, + "num_tokens": 108793666.0, + "step": 213 + }, + { + "epoch": 0.0578691184424013, + "grad_norm": 4.84375, + "learning_rate": 0.007675675675675676, + "loss": 3.4536, + "mean_token_accuracy": 0.3883427679538727, + "num_tokens": 109261396.0, + "step": 214 + }, + { + "epoch": 0.05813953488372093, + "grad_norm": 7.8125, + "learning_rate": 0.007711711711711712, + "loss": 3.4438, + "mean_token_accuracy": 0.3497556447982788, + "num_tokens": 109785674.0, + "step": 215 + }, + { + "epoch": 0.058409951325040566, + "grad_norm": 4.59375, + "learning_rate": 0.0077477477477477475, + "loss": 3.156, + "mean_token_accuracy": 0.3881916403770447, + "num_tokens": 110309862.0, + "step": 216 + }, + { + "epoch": 0.058680367766360195, + "grad_norm": 7.125, + "learning_rate": 0.007783783783783784, + "loss": 3.2479, + "mean_token_accuracy": 0.37312933802604675, + "num_tokens": 110834041.0, + "step": 217 + }, + { + "epoch": 0.058950784207679824, + "grad_norm": 6.375, + "learning_rate": 0.00781981981981982, + "loss": 3.6118, + "mean_token_accuracy": 0.3621352016925812, + "num_tokens": 111358292.0, + "step": 218 + }, + { + "epoch": 0.05922120064899946, + "grad_norm": 6.8125, + "learning_rate": 0.007855855855855855, + "loss": 3.1045, + "mean_token_accuracy": 0.382032185792923, + "num_tokens": 111845102.0, + "step": 219 + }, + { + "epoch": 0.05949161709031909, + "grad_norm": 8.75, + "learning_rate": 0.007891891891891892, + "loss": 3.4882, + "mean_token_accuracy": 0.36347129940986633, + "num_tokens": 112369381.0, + "step": 220 + }, + { + "epoch": 0.059762033531638725, + "grad_norm": 9.5, + "learning_rate": 0.007927927927927928, + "loss": 3.2893, + "mean_token_accuracy": 0.36917221546173096, + "num_tokens": 112893415.0, + "step": 221 + }, + { + "epoch": 0.060032449972958354, + "grad_norm": 6.375, + "learning_rate": 0.007963963963963963, + "loss": 3.2294, + "mean_token_accuracy": 0.4029357135295868, + "num_tokens": 113354162.0, + "step": 222 + }, + { + "epoch": 0.06030286641427799, + "grad_norm": 6.21875, + "learning_rate": 0.008, + "loss": 3.2888, + "mean_token_accuracy": 0.3711487054824829, + "num_tokens": 113878265.0, + "step": 223 + }, + { + "epoch": 0.06057328285559762, + "grad_norm": 4.84375, + "learning_rate": 0.008036036036036037, + "loss": 3.1617, + "mean_token_accuracy": 0.38348162174224854, + "num_tokens": 114402317.0, + "step": 224 + }, + { + "epoch": 0.060843699296917256, + "grad_norm": 4.96875, + "learning_rate": 0.008072072072072072, + "loss": 3.4026, + "mean_token_accuracy": 0.37100082635879517, + "num_tokens": 114896722.0, + "step": 225 + }, + { + "epoch": 0.061114115738236885, + "grad_norm": 4.625, + "learning_rate": 0.008108108108108109, + "loss": 3.248, + "mean_token_accuracy": 0.38638797402381897, + "num_tokens": 115420855.0, + "step": 226 + }, + { + "epoch": 0.061384532179556514, + "grad_norm": 6.875, + "learning_rate": 0.008144144144144144, + "loss": 3.4004, + "mean_token_accuracy": 0.3671044111251831, + "num_tokens": 115945043.0, + "step": 227 + }, + { + "epoch": 0.06165494862087615, + "grad_norm": 7.09375, + "learning_rate": 0.00818018018018018, + "loss": 3.4937, + "mean_token_accuracy": 0.3682820796966553, + "num_tokens": 116469309.0, + "step": 228 + }, + { + "epoch": 0.06192536506219578, + "grad_norm": 7.125, + "learning_rate": 0.008216216216216217, + "loss": 3.2297, + "mean_token_accuracy": 0.3920988440513611, + "num_tokens": 116955746.0, + "step": 229 + }, + { + "epoch": 0.062195781503515415, + "grad_norm": 5.90625, + "learning_rate": 0.008252252252252252, + "loss": 3.1745, + "mean_token_accuracy": 0.407388836145401, + "num_tokens": 117418660.0, + "step": 230 + }, + { + "epoch": 0.062466197944835045, + "grad_norm": 0.640625, + "learning_rate": 0.008288288288288289, + "loss": 11.0788, + "mean_token_accuracy": 0.0, + "num_tokens": 117942821.0, + "step": 231 + }, + { + "epoch": 0.06273661438615467, + "grad_norm": 20.75, + "learning_rate": 0.008324324324324325, + "loss": 3.7747, + "mean_token_accuracy": 0.3116956353187561, + "num_tokens": 118467048.0, + "step": 232 + }, + { + "epoch": 0.06300703082747432, + "grad_norm": 6.09375, + "learning_rate": 0.00836036036036036, + "loss": 3.6099, + "mean_token_accuracy": 0.34705454111099243, + "num_tokens": 118938682.0, + "step": 233 + }, + { + "epoch": 0.06327744726879395, + "grad_norm": 7.0, + "learning_rate": 0.008396396396396397, + "loss": 3.473, + "mean_token_accuracy": 0.35430318117141724, + "num_tokens": 119462807.0, + "step": 234 + }, + { + "epoch": 0.06354786371011358, + "grad_norm": 7.3125, + "learning_rate": 0.008432432432432434, + "loss": 3.1595, + "mean_token_accuracy": 0.36218690872192383, + "num_tokens": 119987074.0, + "step": 235 + }, + { + "epoch": 0.0638182801514332, + "grad_norm": 5.46875, + "learning_rate": 0.008468468468468469, + "loss": 3.3291, + "mean_token_accuracy": 0.355489045381546, + "num_tokens": 120511358.0, + "step": 236 + }, + { + "epoch": 0.06408869659275283, + "grad_norm": 7.09375, + "learning_rate": 0.008504504504504504, + "loss": 3.4919, + "mean_token_accuracy": 0.36466383934020996, + "num_tokens": 121035622.0, + "step": 237 + }, + { + "epoch": 0.06435911303407248, + "grad_norm": 5.8125, + "learning_rate": 0.00854054054054054, + "loss": 3.3124, + "mean_token_accuracy": 0.36840787529945374, + "num_tokens": 121559849.0, + "step": 238 + }, + { + "epoch": 0.0646295294753921, + "grad_norm": 6.96875, + "learning_rate": 0.008576576576576577, + "loss": 3.5467, + "mean_token_accuracy": 0.3625534772872925, + "num_tokens": 122056150.0, + "step": 239 + }, + { + "epoch": 0.06489994591671173, + "grad_norm": 25.0, + "learning_rate": 0.008612612612612612, + "loss": 3.2446, + "mean_token_accuracy": 0.3728850781917572, + "num_tokens": 122580376.0, + "step": 240 + }, + { + "epoch": 0.06517036235803136, + "grad_norm": 6.8125, + "learning_rate": 0.008648648648648649, + "loss": 3.3555, + "mean_token_accuracy": 0.3906978368759155, + "num_tokens": 123016782.0, + "step": 241 + }, + { + "epoch": 0.06544077879935101, + "grad_norm": 4.15625, + "learning_rate": 0.008684684684684685, + "loss": 3.2332, + "mean_token_accuracy": 0.3834397792816162, + "num_tokens": 123497362.0, + "step": 242 + }, + { + "epoch": 0.06571119524067064, + "grad_norm": 6.71875, + "learning_rate": 0.00872072072072072, + "loss": 3.4811, + "mean_token_accuracy": 0.36765480041503906, + "num_tokens": 123969258.0, + "step": 243 + }, + { + "epoch": 0.06598161168199027, + "grad_norm": 6.84375, + "learning_rate": 0.008756756756756757, + "loss": 3.5798, + "mean_token_accuracy": 0.3639242947101593, + "num_tokens": 124446311.0, + "step": 244 + }, + { + "epoch": 0.0662520281233099, + "grad_norm": 6.5625, + "learning_rate": 0.008792792792792794, + "loss": 3.6236, + "mean_token_accuracy": 0.35265201330184937, + "num_tokens": 124963238.0, + "step": 245 + }, + { + "epoch": 0.06652244456462952, + "grad_norm": 7.125, + "learning_rate": 0.008828828828828829, + "loss": 3.431, + "mean_token_accuracy": 0.3819119930267334, + "num_tokens": 125480450.0, + "step": 246 + }, + { + "epoch": 0.06679286100594917, + "grad_norm": 6.625, + "learning_rate": 0.008864864864864866, + "loss": 3.4263, + "mean_token_accuracy": 0.3615180253982544, + "num_tokens": 126004734.0, + "step": 247 + }, + { + "epoch": 0.0670632774472688, + "grad_norm": 6.09375, + "learning_rate": 0.0089009009009009, + "loss": 3.5607, + "mean_token_accuracy": 0.3666744828224182, + "num_tokens": 126529000.0, + "step": 248 + }, + { + "epoch": 0.06733369388858843, + "grad_norm": 7.34375, + "learning_rate": 0.008936936936936937, + "loss": 3.1218, + "mean_token_accuracy": 0.3334798216819763, + "num_tokens": 127053277.0, + "step": 249 + }, + { + "epoch": 0.06760411032990805, + "grad_norm": 6.28125, + "learning_rate": 0.008972972972972974, + "loss": 3.4037, + "mean_token_accuracy": 0.3620274066925049, + "num_tokens": 127577424.0, + "step": 250 + }, + { + "epoch": 0.0678745267712277, + "grad_norm": 1.015625, + "learning_rate": 0.009009009009009009, + "loss": 10.5811, + "mean_token_accuracy": 1.3129930266586598e-05, + "num_tokens": 128058622.0, + "step": 251 + }, + { + "epoch": 0.06814494321254733, + "grad_norm": 17.25, + "learning_rate": 0.009045045045045046, + "loss": 3.986, + "mean_token_accuracy": 0.3074209690093994, + "num_tokens": 128582903.0, + "step": 252 + }, + { + "epoch": 0.06841535965386696, + "grad_norm": 5.28125, + "learning_rate": 0.009081081081081082, + "loss": 3.6178, + "mean_token_accuracy": 0.36830735206604004, + "num_tokens": 129072012.0, + "step": 253 + }, + { + "epoch": 0.06868577609518658, + "grad_norm": 7.84375, + "learning_rate": 0.009117117117117117, + "loss": 3.5239, + "mean_token_accuracy": 0.3685789108276367, + "num_tokens": 129596261.0, + "step": 254 + }, + { + "epoch": 0.06895619253650621, + "grad_norm": 7.0, + "learning_rate": 0.009153153153153152, + "loss": 3.4824, + "mean_token_accuracy": 0.3389648199081421, + "num_tokens": 130073668.0, + "step": 255 + }, + { + "epoch": 0.06922660897782586, + "grad_norm": 6.6875, + "learning_rate": 0.00918918918918919, + "loss": 3.4064, + "mean_token_accuracy": 0.37897205352783203, + "num_tokens": 130597818.0, + "step": 256 + }, + { + "epoch": 0.06949702541914549, + "grad_norm": 7.65625, + "learning_rate": 0.009225225225225226, + "loss": 3.452, + "mean_token_accuracy": 0.3546660542488098, + "num_tokens": 131121941.0, + "step": 257 + }, + { + "epoch": 0.06976744186046512, + "grad_norm": 5.09375, + "learning_rate": 0.00926126126126126, + "loss": 3.4488, + "mean_token_accuracy": 0.4169479012489319, + "num_tokens": 131541947.0, + "step": 258 + }, + { + "epoch": 0.07003785830178474, + "grad_norm": 6.5, + "learning_rate": 0.009297297297297297, + "loss": 3.6377, + "mean_token_accuracy": 0.2976022958755493, + "num_tokens": 132066156.0, + "step": 259 + }, + { + "epoch": 0.07030827474310439, + "grad_norm": 5.25, + "learning_rate": 0.009333333333333334, + "loss": 3.4235, + "mean_token_accuracy": 0.3668389320373535, + "num_tokens": 132590406.0, + "step": 260 + }, + { + "epoch": 0.07057869118442402, + "grad_norm": 7.1875, + "learning_rate": 0.009369369369369369, + "loss": 3.1904, + "mean_token_accuracy": 0.3626983165740967, + "num_tokens": 133114665.0, + "step": 261 + }, + { + "epoch": 0.07084910762574365, + "grad_norm": 8.0, + "learning_rate": 0.009405405405405406, + "loss": 3.2147, + "mean_token_accuracy": 0.39265966415405273, + "num_tokens": 133563554.0, + "step": 262 + }, + { + "epoch": 0.07111952406706328, + "grad_norm": 5.59375, + "learning_rate": 0.009441441441441442, + "loss": 3.3469, + "mean_token_accuracy": 0.362255334854126, + "num_tokens": 134087800.0, + "step": 263 + }, + { + "epoch": 0.0713899405083829, + "grad_norm": 5.78125, + "learning_rate": 0.009477477477477477, + "loss": 3.4008, + "mean_token_accuracy": 0.35963016748428345, + "num_tokens": 134612065.0, + "step": 264 + }, + { + "epoch": 0.07166035694970255, + "grad_norm": 4.625, + "learning_rate": 0.009513513513513514, + "loss": 3.1077, + "mean_token_accuracy": 0.4022493362426758, + "num_tokens": 135136309.0, + "step": 265 + }, + { + "epoch": 0.07193077339102218, + "grad_norm": 5.78125, + "learning_rate": 0.00954954954954955, + "loss": 3.2967, + "mean_token_accuracy": 0.34811729192733765, + "num_tokens": 135660377.0, + "step": 266 + }, + { + "epoch": 0.0722011898323418, + "grad_norm": 5.4375, + "learning_rate": 0.009585585585585586, + "loss": 3.2467, + "mean_token_accuracy": 0.3809099793434143, + "num_tokens": 136136792.0, + "step": 267 + }, + { + "epoch": 0.07247160627366143, + "grad_norm": 6.40625, + "learning_rate": 0.009621621621621623, + "loss": 3.282, + "mean_token_accuracy": 0.4120599925518036, + "num_tokens": 136596668.0, + "step": 268 + }, + { + "epoch": 0.07274202271498108, + "grad_norm": 5.1875, + "learning_rate": 0.009657657657657658, + "loss": 3.2559, + "mean_token_accuracy": 0.3908980190753937, + "num_tokens": 137120903.0, + "step": 269 + }, + { + "epoch": 0.0730124391563007, + "grad_norm": 6.53125, + "learning_rate": 0.009693693693693694, + "loss": 3.2063, + "mean_token_accuracy": 0.373100608587265, + "num_tokens": 137645055.0, + "step": 270 + }, + { + "epoch": 0.07328285559762034, + "grad_norm": 20.0, + "learning_rate": 0.009729729729729731, + "loss": 15.6136, + "mean_token_accuracy": 0.0290638767182827, + "num_tokens": 138104389.0, + "step": 271 + }, + { + "epoch": 0.07355327203893997, + "grad_norm": 18.125, + "learning_rate": 0.009765765765765766, + "loss": 3.6717, + "mean_token_accuracy": 0.3281637132167816, + "num_tokens": 138628557.0, + "step": 272 + }, + { + "epoch": 0.0738236884802596, + "grad_norm": 4.71875, + "learning_rate": 0.009801801801801803, + "loss": 3.4363, + "mean_token_accuracy": 0.3715510666370392, + "num_tokens": 139152741.0, + "step": 273 + }, + { + "epoch": 0.07409410492157924, + "grad_norm": 4.8125, + "learning_rate": 0.00983783783783784, + "loss": 3.3983, + "mean_token_accuracy": 0.3869807720184326, + "num_tokens": 139621635.0, + "step": 274 + }, + { + "epoch": 0.07436452136289887, + "grad_norm": 5.6875, + "learning_rate": 0.009873873873873874, + "loss": 3.4399, + "mean_token_accuracy": 0.37338560819625854, + "num_tokens": 140145873.0, + "step": 275 + }, + { + "epoch": 0.0746349378042185, + "grad_norm": 4.5625, + "learning_rate": 0.00990990990990991, + "loss": 3.3655, + "mean_token_accuracy": 0.41806069016456604, + "num_tokens": 140605313.0, + "step": 276 + }, + { + "epoch": 0.07490535424553812, + "grad_norm": 7.1875, + "learning_rate": 0.009945945945945946, + "loss": 3.4697, + "mean_token_accuracy": 0.3685745298862457, + "num_tokens": 141129440.0, + "step": 277 + }, + { + "epoch": 0.07517577068685775, + "grad_norm": 4.21875, + "learning_rate": 0.009981981981981983, + "loss": 3.1909, + "mean_token_accuracy": 0.38795939087867737, + "num_tokens": 141605033.0, + "step": 278 + }, + { + "epoch": 0.0754461871281774, + "grad_norm": 6.09375, + "learning_rate": 0.010018018018018018, + "loss": 3.1198, + "mean_token_accuracy": 0.40002840757369995, + "num_tokens": 142129205.0, + "step": 279 + }, + { + "epoch": 0.07571660356949703, + "grad_norm": 5.8125, + "learning_rate": 0.010054054054054054, + "loss": 3.457, + "mean_token_accuracy": 0.4031257629394531, + "num_tokens": 142653392.0, + "step": 280 + }, + { + "epoch": 0.07598702001081666, + "grad_norm": 3.84375, + "learning_rate": 0.01009009009009009, + "loss": 3.084, + "mean_token_accuracy": 0.38879650831222534, + "num_tokens": 143177643.0, + "step": 281 + }, + { + "epoch": 0.07625743645213628, + "grad_norm": 4.34375, + "learning_rate": 0.010126126126126128, + "loss": 3.4812, + "mean_token_accuracy": 0.36020171642303467, + "num_tokens": 143701922.0, + "step": 282 + }, + { + "epoch": 0.07652785289345593, + "grad_norm": 5.03125, + "learning_rate": 0.010162162162162163, + "loss": 3.4876, + "mean_token_accuracy": 0.349264919757843, + "num_tokens": 144226111.0, + "step": 283 + }, + { + "epoch": 0.07679826933477556, + "grad_norm": 4.78125, + "learning_rate": 0.010198198198198198, + "loss": 3.1154, + "mean_token_accuracy": 0.3958362936973572, + "num_tokens": 144750329.0, + "step": 284 + }, + { + "epoch": 0.07706868577609519, + "grad_norm": 4.75, + "learning_rate": 0.010234234234234234, + "loss": 3.1044, + "mean_token_accuracy": 0.40579700469970703, + "num_tokens": 145274553.0, + "step": 285 + }, + { + "epoch": 0.07733910221741482, + "grad_norm": 27.0, + "learning_rate": 0.01027027027027027, + "loss": 3.2199, + "mean_token_accuracy": 0.39092254638671875, + "num_tokens": 145786475.0, + "step": 286 + }, + { + "epoch": 0.07760951865873444, + "grad_norm": 6.25, + "learning_rate": 0.010306306306306306, + "loss": 3.3212, + "mean_token_accuracy": 0.3715789318084717, + "num_tokens": 146291480.0, + "step": 287 + }, + { + "epoch": 0.07787993510005409, + "grad_norm": 4.125, + "learning_rate": 0.010342342342342343, + "loss": 3.4231, + "mean_token_accuracy": 0.37572553753852844, + "num_tokens": 146815538.0, + "step": 288 + }, + { + "epoch": 0.07815035154137372, + "grad_norm": 6.0, + "learning_rate": 0.01037837837837838, + "loss": 3.1334, + "mean_token_accuracy": 0.39749354124069214, + "num_tokens": 147295956.0, + "step": 289 + }, + { + "epoch": 0.07842076798269335, + "grad_norm": 5.4375, + "learning_rate": 0.010414414414414415, + "loss": 3.3273, + "mean_token_accuracy": 0.4127611517906189, + "num_tokens": 147748938.0, + "step": 290 + }, + { + "epoch": 0.07869118442401297, + "grad_norm": 4.6875, + "learning_rate": 0.010450450450450451, + "loss": 11.5383, + "mean_token_accuracy": 0.0, + "num_tokens": 148170030.0, + "step": 291 + }, + { + "epoch": 0.07896160086533262, + "grad_norm": 9.6875, + "learning_rate": 0.010486486486486486, + "loss": 3.7506, + "mean_token_accuracy": 0.3216307759284973, + "num_tokens": 148628511.0, + "step": 292 + }, + { + "epoch": 0.07923201730665225, + "grad_norm": 5.28125, + "learning_rate": 0.010522522522522521, + "loss": 3.4866, + "mean_token_accuracy": 0.3614810109138489, + "num_tokens": 149152790.0, + "step": 293 + }, + { + "epoch": 0.07950243374797188, + "grad_norm": 5.4375, + "learning_rate": 0.01055855855855856, + "loss": 3.2631, + "mean_token_accuracy": 0.39304429292678833, + "num_tokens": 149676971.0, + "step": 294 + }, + { + "epoch": 0.0797728501892915, + "grad_norm": 6.1875, + "learning_rate": 0.010594594594594595, + "loss": 3.4489, + "mean_token_accuracy": 0.36168164014816284, + "num_tokens": 150201229.0, + "step": 295 + }, + { + "epoch": 0.08004326663061113, + "grad_norm": 4.84375, + "learning_rate": 0.010630630630630631, + "loss": 3.0578, + "mean_token_accuracy": 0.4142261743545532, + "num_tokens": 150725476.0, + "step": 296 + }, + { + "epoch": 0.08031368307193078, + "grad_norm": 5.90625, + "learning_rate": 0.010666666666666666, + "loss": 3.3174, + "mean_token_accuracy": 0.3961215615272522, + "num_tokens": 151197117.0, + "step": 297 + }, + { + "epoch": 0.0805840995132504, + "grad_norm": 6.0, + "learning_rate": 0.010702702702702703, + "loss": 3.3621, + "mean_token_accuracy": 0.3756311535835266, + "num_tokens": 151721299.0, + "step": 298 + }, + { + "epoch": 0.08085451595457004, + "grad_norm": 5.125, + "learning_rate": 0.010738738738738738, + "loss": 3.4178, + "mean_token_accuracy": 0.37468579411506653, + "num_tokens": 152245447.0, + "step": 299 + }, + { + "epoch": 0.08112493239588967, + "grad_norm": 5.4375, + "learning_rate": 0.010774774774774776, + "loss": 3.2651, + "mean_token_accuracy": 0.41818028688430786, + "num_tokens": 152704241.0, + "step": 300 + }, + { + "epoch": 0.08139534883720931, + "grad_norm": 5.21875, + "learning_rate": 0.010810810810810811, + "loss": 3.1252, + "mean_token_accuracy": 0.4180424213409424, + "num_tokens": 153196089.0, + "step": 301 + }, + { + "epoch": 0.08166576527852894, + "grad_norm": 4.28125, + "learning_rate": 0.010846846846846848, + "loss": 3.2435, + "mean_token_accuracy": 0.4069098234176636, + "num_tokens": 153720293.0, + "step": 302 + }, + { + "epoch": 0.08193618171984857, + "grad_norm": 3.453125, + "learning_rate": 0.010882882882882883, + "loss": 3.1422, + "mean_token_accuracy": 0.43049710988998413, + "num_tokens": 154133919.0, + "step": 303 + }, + { + "epoch": 0.0822065981611682, + "grad_norm": 3.65625, + "learning_rate": 0.010918918918918918, + "loss": 3.3724, + "mean_token_accuracy": 0.35429948568344116, + "num_tokens": 154658164.0, + "step": 304 + }, + { + "epoch": 0.08247701460248782, + "grad_norm": 5.5625, + "learning_rate": 0.010954954954954955, + "loss": 3.2131, + "mean_token_accuracy": 0.40096843242645264, + "num_tokens": 155127255.0, + "step": 305 + }, + { + "epoch": 0.08274743104380747, + "grad_norm": 6.625, + "learning_rate": 0.010990990990990991, + "loss": 3.3535, + "mean_token_accuracy": 0.37843483686447144, + "num_tokens": 155651405.0, + "step": 306 + }, + { + "epoch": 0.0830178474851271, + "grad_norm": 5.0, + "learning_rate": 0.011027027027027028, + "loss": 3.1645, + "mean_token_accuracy": 0.39047789573669434, + "num_tokens": 156175689.0, + "step": 307 + }, + { + "epoch": 0.08328826392644673, + "grad_norm": 4.46875, + "learning_rate": 0.011063063063063063, + "loss": 3.1571, + "mean_token_accuracy": 0.390042245388031, + "num_tokens": 156699937.0, + "step": 308 + }, + { + "epoch": 0.08355868036776636, + "grad_norm": 3.84375, + "learning_rate": 0.0110990990990991, + "loss": 3.3013, + "mean_token_accuracy": 0.38376879692077637, + "num_tokens": 157224144.0, + "step": 309 + }, + { + "epoch": 0.083829096809086, + "grad_norm": 4.375, + "learning_rate": 0.011135135135135135, + "loss": 3.3193, + "mean_token_accuracy": 0.3900904059410095, + "num_tokens": 157748328.0, + "step": 310 + }, + { + "epoch": 0.08409951325040563, + "grad_norm": 1.375, + "learning_rate": 0.01117117117117117, + "loss": 10.2871, + "mean_token_accuracy": 1.3315939213498496e-05, + "num_tokens": 158229813.0, + "step": 311 + }, + { + "epoch": 0.08436992969172526, + "grad_norm": 39.75, + "learning_rate": 0.011207207207207208, + "loss": 3.9377, + "mean_token_accuracy": 0.2922338545322418, + "num_tokens": 158754076.0, + "step": 312 + }, + { + "epoch": 0.08464034613304489, + "grad_norm": 5.625, + "learning_rate": 0.011243243243243243, + "loss": 3.7197, + "mean_token_accuracy": 0.34525057673454285, + "num_tokens": 159239857.0, + "step": 313 + }, + { + "epoch": 0.08491076257436452, + "grad_norm": 3.375, + "learning_rate": 0.01127927927927928, + "loss": 3.4292, + "mean_token_accuracy": 0.3820377588272095, + "num_tokens": 159757956.0, + "step": 314 + }, + { + "epoch": 0.08518117901568416, + "grad_norm": 7.96875, + "learning_rate": 0.011315315315315315, + "loss": 3.7049, + "mean_token_accuracy": 0.33400899171829224, + "num_tokens": 160223980.0, + "step": 315 + }, + { + "epoch": 0.08545159545700379, + "grad_norm": 3.65625, + "learning_rate": 0.011351351351351352, + "loss": 3.3707, + "mean_token_accuracy": 0.3813416659832001, + "num_tokens": 160711704.0, + "step": 316 + }, + { + "epoch": 0.08572201189832342, + "grad_norm": 5.1875, + "learning_rate": 0.011387387387387387, + "loss": 3.2618, + "mean_token_accuracy": 0.3656204342842102, + "num_tokens": 161235927.0, + "step": 317 + }, + { + "epoch": 0.08599242833964305, + "grad_norm": 4.03125, + "learning_rate": 0.011423423423423425, + "loss": 3.3239, + "mean_token_accuracy": 0.3692861795425415, + "num_tokens": 161760056.0, + "step": 318 + }, + { + "epoch": 0.08626284478096269, + "grad_norm": 4.375, + "learning_rate": 0.01145945945945946, + "loss": 3.3508, + "mean_token_accuracy": 0.37187209725379944, + "num_tokens": 162284149.0, + "step": 319 + }, + { + "epoch": 0.08653326122228232, + "grad_norm": 4.125, + "learning_rate": 0.011495495495495497, + "loss": 3.335, + "mean_token_accuracy": 0.3900958001613617, + "num_tokens": 162751893.0, + "step": 320 + }, + { + "epoch": 0.08680367766360195, + "grad_norm": 5.09375, + "learning_rate": 0.011531531531531532, + "loss": 3.0701, + "mean_token_accuracy": 0.4107723832130432, + "num_tokens": 163219551.0, + "step": 321 + }, + { + "epoch": 0.08707409410492158, + "grad_norm": 3.78125, + "learning_rate": 0.011567567567567567, + "loss": 3.0291, + "mean_token_accuracy": 0.41660717129707336, + "num_tokens": 163709907.0, + "step": 322 + }, + { + "epoch": 0.0873445105462412, + "grad_norm": 6.8125, + "learning_rate": 0.011603603603603603, + "loss": 3.5761, + "mean_token_accuracy": 0.3483666479587555, + "num_tokens": 164234175.0, + "step": 323 + }, + { + "epoch": 0.08761492698756085, + "grad_norm": 4.15625, + "learning_rate": 0.01163963963963964, + "loss": 3.3428, + "mean_token_accuracy": 0.3766704201698303, + "num_tokens": 164758395.0, + "step": 324 + }, + { + "epoch": 0.08788534342888048, + "grad_norm": 5.4375, + "learning_rate": 0.011675675675675677, + "loss": 3.2382, + "mean_token_accuracy": 0.3975132703781128, + "num_tokens": 165238063.0, + "step": 325 + }, + { + "epoch": 0.0881557598702001, + "grad_norm": 3.75, + "learning_rate": 0.011711711711711712, + "loss": 3.2165, + "mean_token_accuracy": 0.3829250931739807, + "num_tokens": 165762299.0, + "step": 326 + }, + { + "epoch": 0.08842617631151974, + "grad_norm": 4.21875, + "learning_rate": 0.011747747747747748, + "loss": 3.1601, + "mean_token_accuracy": 0.38543760776519775, + "num_tokens": 166286543.0, + "step": 327 + }, + { + "epoch": 0.08869659275283938, + "grad_norm": 4.9375, + "learning_rate": 0.011783783783783783, + "loss": 3.4922, + "mean_token_accuracy": 0.3703402280807495, + "num_tokens": 166810812.0, + "step": 328 + }, + { + "epoch": 0.08896700919415901, + "grad_norm": 3.1875, + "learning_rate": 0.011819819819819818, + "loss": 2.9957, + "mean_token_accuracy": 0.4012971520423889, + "num_tokens": 167334981.0, + "step": 329 + }, + { + "epoch": 0.08923742563547864, + "grad_norm": 5.34375, + "learning_rate": 0.011855855855855857, + "loss": 2.9753, + "mean_token_accuracy": 0.40028804540634155, + "num_tokens": 167859208.0, + "step": 330 + }, + { + "epoch": 0.08950784207679827, + "grad_norm": 5.15625, + "learning_rate": 0.011891891891891894, + "loss": 10.0516, + "mean_token_accuracy": 0.007627969142049551, + "num_tokens": 168383488.0, + "step": 331 + }, + { + "epoch": 0.0897782585181179, + "grad_norm": 21.375, + "learning_rate": 0.011927927927927929, + "loss": 4.2017, + "mean_token_accuracy": 0.34171614050865173, + "num_tokens": 168848238.0, + "step": 332 + }, + { + "epoch": 0.09004867495943754, + "grad_norm": 4.96875, + "learning_rate": 0.011963963963963964, + "loss": 3.3783, + "mean_token_accuracy": 0.3737707734107971, + "num_tokens": 169351405.0, + "step": 333 + }, + { + "epoch": 0.09031909140075717, + "grad_norm": 3.328125, + "learning_rate": 0.012, + "loss": 3.6276, + "mean_token_accuracy": 0.3311178684234619, + "num_tokens": 169875667.0, + "step": 334 + }, + { + "epoch": 0.0905895078420768, + "grad_norm": 6.53125, + "learning_rate": 0.012036036036036035, + "loss": 3.5061, + "mean_token_accuracy": 0.35076621174812317, + "num_tokens": 170399946.0, + "step": 335 + }, + { + "epoch": 0.09085992428339643, + "grad_norm": 3.8125, + "learning_rate": 0.012072072072072074, + "loss": 3.3021, + "mean_token_accuracy": 0.3808322548866272, + "num_tokens": 170924110.0, + "step": 336 + }, + { + "epoch": 0.09113034072471607, + "grad_norm": 5.96875, + "learning_rate": 0.012108108108108109, + "loss": 3.3366, + "mean_token_accuracy": 0.376431941986084, + "num_tokens": 171448286.0, + "step": 337 + }, + { + "epoch": 0.0914007571660357, + "grad_norm": 3.546875, + "learning_rate": 0.012144144144144145, + "loss": 3.2327, + "mean_token_accuracy": 0.3863096833229065, + "num_tokens": 171972522.0, + "step": 338 + }, + { + "epoch": 0.09167117360735533, + "grad_norm": 4.46875, + "learning_rate": 0.01218018018018018, + "loss": 3.3398, + "mean_token_accuracy": 0.36713656783103943, + "num_tokens": 172496796.0, + "step": 339 + }, + { + "epoch": 0.09194159004867496, + "grad_norm": 3.703125, + "learning_rate": 0.012216216216216215, + "loss": 3.2719, + "mean_token_accuracy": 0.3761569857597351, + "num_tokens": 173020996.0, + "step": 340 + }, + { + "epoch": 0.09221200648999459, + "grad_norm": 4.46875, + "learning_rate": 0.012252252252252252, + "loss": 3.4902, + "mean_token_accuracy": 0.37170493602752686, + "num_tokens": 173545161.0, + "step": 341 + }, + { + "epoch": 0.09248242293131423, + "grad_norm": 3.953125, + "learning_rate": 0.01228828828828829, + "loss": 3.4295, + "mean_token_accuracy": 0.3890172243118286, + "num_tokens": 174006520.0, + "step": 342 + }, + { + "epoch": 0.09275283937263386, + "grad_norm": 3.71875, + "learning_rate": 0.012324324324324325, + "loss": 3.3039, + "mean_token_accuracy": 0.3777199685573578, + "num_tokens": 174496678.0, + "step": 343 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 4.125, + "learning_rate": 0.01236036036036036, + "loss": 2.98, + "mean_token_accuracy": 0.3955426812171936, + "num_tokens": 174916980.0, + "step": 344 + }, + { + "epoch": 0.09329367225527312, + "grad_norm": 3.5, + "learning_rate": 0.012396396396396397, + "loss": 3.2973, + "mean_token_accuracy": 0.3687516450881958, + "num_tokens": 175441258.0, + "step": 345 + }, + { + "epoch": 0.09356408869659276, + "grad_norm": 3.671875, + "learning_rate": 0.012432432432432432, + "loss": 3.1206, + "mean_token_accuracy": 0.4045751094818115, + "num_tokens": 175965496.0, + "step": 346 + }, + { + "epoch": 0.09383450513791239, + "grad_norm": 4.375, + "learning_rate": 0.012468468468468469, + "loss": 3.3424, + "mean_token_accuracy": 0.3734549283981323, + "num_tokens": 176489780.0, + "step": 347 + }, + { + "epoch": 0.09410492157923202, + "grad_norm": 4.21875, + "learning_rate": 0.012504504504504505, + "loss": 3.5457, + "mean_token_accuracy": 0.36188584566116333, + "num_tokens": 177013967.0, + "step": 348 + }, + { + "epoch": 0.09437533802055165, + "grad_norm": 4.03125, + "learning_rate": 0.012540540540540542, + "loss": 3.1577, + "mean_token_accuracy": 0.3899577260017395, + "num_tokens": 177538242.0, + "step": 349 + }, + { + "epoch": 0.09464575446187128, + "grad_norm": 3.71875, + "learning_rate": 0.012576576576576577, + "loss": 3.0742, + "mean_token_accuracy": 0.39818066358566284, + "num_tokens": 178062289.0, + "step": 350 + }, + { + "epoch": 0.09491617090319092, + "grad_norm": 1.21875, + "learning_rate": 0.012612612612612612, + "loss": 10.4096, + "mean_token_accuracy": 1.569340383866802e-05, + "num_tokens": 178586413.0, + "step": 351 + }, + { + "epoch": 0.09518658734451055, + "grad_norm": 10.875, + "learning_rate": 0.012648648648648649, + "loss": 3.6979, + "mean_token_accuracy": 0.30794423818588257, + "num_tokens": 179110675.0, + "step": 352 + }, + { + "epoch": 0.09545700378583018, + "grad_norm": 3.96875, + "learning_rate": 0.012684684684684684, + "loss": 3.4657, + "mean_token_accuracy": 0.35828953981399536, + "num_tokens": 179634893.0, + "step": 353 + }, + { + "epoch": 0.0957274202271498, + "grad_norm": 5.28125, + "learning_rate": 0.012720720720720722, + "loss": 3.574, + "mean_token_accuracy": 0.36261504888534546, + "num_tokens": 180159147.0, + "step": 354 + }, + { + "epoch": 0.09599783666846945, + "grad_norm": 4.84375, + "learning_rate": 0.012756756756756757, + "loss": 3.3801, + "mean_token_accuracy": 0.38302716612815857, + "num_tokens": 180678603.0, + "step": 355 + }, + { + "epoch": 0.09626825310978908, + "grad_norm": 4.59375, + "learning_rate": 0.012792792792792794, + "loss": 3.3198, + "mean_token_accuracy": 0.37422436475753784, + "num_tokens": 181176206.0, + "step": 356 + }, + { + "epoch": 0.09653866955110871, + "grad_norm": 3.953125, + "learning_rate": 0.012828828828828829, + "loss": 3.3456, + "mean_token_accuracy": 0.377391517162323, + "num_tokens": 181700344.0, + "step": 357 + }, + { + "epoch": 0.09680908599242834, + "grad_norm": 5.25, + "learning_rate": 0.012864864864864864, + "loss": 3.4368, + "mean_token_accuracy": 0.3666076362133026, + "num_tokens": 182224509.0, + "step": 358 + }, + { + "epoch": 0.09707950243374797, + "grad_norm": 3.59375, + "learning_rate": 0.0129009009009009, + "loss": 3.445, + "mean_token_accuracy": 0.4006487727165222, + "num_tokens": 182690643.0, + "step": 359 + }, + { + "epoch": 0.09734991887506761, + "grad_norm": 4.46875, + "learning_rate": 0.012936936936936939, + "loss": 3.5562, + "mean_token_accuracy": 0.3650418221950531, + "num_tokens": 183195289.0, + "step": 360 + }, + { + "epoch": 0.09762033531638724, + "grad_norm": 4.3125, + "learning_rate": 0.012972972972972974, + "loss": 3.2781, + "mean_token_accuracy": 0.3848832845687866, + "num_tokens": 183719471.0, + "step": 361 + }, + { + "epoch": 0.09789075175770687, + "grad_norm": 5.8125, + "learning_rate": 0.013009009009009009, + "loss": 3.4654, + "mean_token_accuracy": 0.37645643949508667, + "num_tokens": 184190385.0, + "step": 362 + }, + { + "epoch": 0.0981611681990265, + "grad_norm": 4.375, + "learning_rate": 0.013045045045045046, + "loss": 3.272, + "mean_token_accuracy": 0.40620890259742737, + "num_tokens": 184714613.0, + "step": 363 + }, + { + "epoch": 0.09843158464034614, + "grad_norm": 4.46875, + "learning_rate": 0.01308108108108108, + "loss": 3.5344, + "mean_token_accuracy": 0.3643481731414795, + "num_tokens": 185238891.0, + "step": 364 + }, + { + "epoch": 0.09870200108166577, + "grad_norm": 3.765625, + "learning_rate": 0.013117117117117117, + "loss": 3.0863, + "mean_token_accuracy": 0.37224188446998596, + "num_tokens": 185763043.0, + "step": 365 + }, + { + "epoch": 0.0989724175229854, + "grad_norm": 3.1875, + "learning_rate": 0.013153153153153154, + "loss": 3.4805, + "mean_token_accuracy": 0.3655497431755066, + "num_tokens": 186287311.0, + "step": 366 + }, + { + "epoch": 0.09924283396430503, + "grad_norm": 5.6875, + "learning_rate": 0.01318918918918919, + "loss": 3.3216, + "mean_token_accuracy": 0.3712417483329773, + "num_tokens": 186811524.0, + "step": 367 + }, + { + "epoch": 0.09951325040562466, + "grad_norm": 3.953125, + "learning_rate": 0.013225225225225226, + "loss": 3.4707, + "mean_token_accuracy": 0.3712874948978424, + "num_tokens": 187335756.0, + "step": 368 + }, + { + "epoch": 0.0997836668469443, + "grad_norm": 5.03125, + "learning_rate": 0.01326126126126126, + "loss": 3.2076, + "mean_token_accuracy": 0.37496882677078247, + "num_tokens": 187860017.0, + "step": 369 + }, + { + "epoch": 0.10005408328826393, + "grad_norm": 4.125, + "learning_rate": 0.013297297297297297, + "loss": 3.4506, + "mean_token_accuracy": 0.373843789100647, + "num_tokens": 188384203.0, + "step": 370 + }, + { + "epoch": 0.10032449972958356, + "grad_norm": 2.125, + "learning_rate": 0.013333333333333332, + "loss": 9.2342, + "mean_token_accuracy": 0.0021661731880158186, + "num_tokens": 188903646.0, + "step": 371 + }, + { + "epoch": 0.10059491617090319, + "grad_norm": 14.4375, + "learning_rate": 0.01336936936936937, + "loss": 4.2466, + "mean_token_accuracy": 0.2761128544807434, + "num_tokens": 189385535.0, + "step": 372 + }, + { + "epoch": 0.10086533261222283, + "grad_norm": 3.71875, + "learning_rate": 0.013405405405405406, + "loss": 3.4982, + "mean_token_accuracy": 0.35481593012809753, + "num_tokens": 189900943.0, + "step": 373 + }, + { + "epoch": 0.10113574905354246, + "grad_norm": 2.796875, + "learning_rate": 0.013441441441441443, + "loss": 3.3761, + "mean_token_accuracy": 0.36975789070129395, + "num_tokens": 190425221.0, + "step": 374 + }, + { + "epoch": 0.10140616549486209, + "grad_norm": 4.6875, + "learning_rate": 0.013477477477477478, + "loss": 3.4005, + "mean_token_accuracy": 0.40063443779945374, + "num_tokens": 190834197.0, + "step": 375 + }, + { + "epoch": 0.10167658193618172, + "grad_norm": 3.078125, + "learning_rate": 0.013513513513513514, + "loss": 3.2099, + "mean_token_accuracy": 0.38547757267951965, + "num_tokens": 191319760.0, + "step": 376 + }, + { + "epoch": 0.10194699837750135, + "grad_norm": 4.21875, + "learning_rate": 0.01354954954954955, + "loss": 3.4916, + "mean_token_accuracy": 0.36678430438041687, + "num_tokens": 191843851.0, + "step": 377 + }, + { + "epoch": 0.10221741481882099, + "grad_norm": 4.09375, + "learning_rate": 0.013585585585585584, + "loss": 3.4859, + "mean_token_accuracy": 0.3703417181968689, + "num_tokens": 192368129.0, + "step": 378 + }, + { + "epoch": 0.10248783126014062, + "grad_norm": 4.59375, + "learning_rate": 0.013621621621621623, + "loss": 3.4227, + "mean_token_accuracy": 0.3480582535266876, + "num_tokens": 192837230.0, + "step": 379 + }, + { + "epoch": 0.10275824770146025, + "grad_norm": 4.0625, + "learning_rate": 0.013657657657657658, + "loss": 3.2614, + "mean_token_accuracy": 0.4195748567581177, + "num_tokens": 193297675.0, + "step": 380 + }, + { + "epoch": 0.10302866414277988, + "grad_norm": 4.375, + "learning_rate": 0.013693693693693694, + "loss": 3.3874, + "mean_token_accuracy": 0.3802534341812134, + "num_tokens": 193821949.0, + "step": 381 + }, + { + "epoch": 0.1032990805840995, + "grad_norm": 3.703125, + "learning_rate": 0.01372972972972973, + "loss": 3.3371, + "mean_token_accuracy": 0.3920239508152008, + "num_tokens": 194296337.0, + "step": 382 + }, + { + "epoch": 0.10356949702541915, + "grad_norm": 2.953125, + "learning_rate": 0.013765765765765766, + "loss": 3.1961, + "mean_token_accuracy": 0.40175169706344604, + "num_tokens": 194820555.0, + "step": 383 + }, + { + "epoch": 0.10383991346673878, + "grad_norm": 3.828125, + "learning_rate": 0.013801801801801801, + "loss": 3.3117, + "mean_token_accuracy": 0.39428025484085083, + "num_tokens": 195344839.0, + "step": 384 + }, + { + "epoch": 0.10411032990805841, + "grad_norm": 3.609375, + "learning_rate": 0.01383783783783784, + "loss": 3.2981, + "mean_token_accuracy": 0.39919304847717285, + "num_tokens": 195812164.0, + "step": 385 + }, + { + "epoch": 0.10438074634937804, + "grad_norm": 3.90625, + "learning_rate": 0.013873873873873874, + "loss": 3.3692, + "mean_token_accuracy": 0.3899870812892914, + "num_tokens": 196298158.0, + "step": 386 + }, + { + "epoch": 0.10465116279069768, + "grad_norm": 4.375, + "learning_rate": 0.01390990990990991, + "loss": 3.416, + "mean_token_accuracy": 0.38159292936325073, + "num_tokens": 196822361.0, + "step": 387 + }, + { + "epoch": 0.10492157923201731, + "grad_norm": 5.21875, + "learning_rate": 0.013945945945945946, + "loss": 3.4844, + "mean_token_accuracy": 0.37778401374816895, + "num_tokens": 197346544.0, + "step": 388 + }, + { + "epoch": 0.10519199567333694, + "grad_norm": 4.25, + "learning_rate": 0.013981981981981981, + "loss": 3.2746, + "mean_token_accuracy": 0.38976144790649414, + "num_tokens": 197870826.0, + "step": 389 + }, + { + "epoch": 0.10546241211465657, + "grad_norm": 5.21875, + "learning_rate": 0.014018018018018018, + "loss": 3.1757, + "mean_token_accuracy": 0.379690945148468, + "num_tokens": 198394977.0, + "step": 390 + }, + { + "epoch": 0.1057328285559762, + "grad_norm": 10.1875, + "learning_rate": 0.014054054054054054, + "loss": 12.1719, + "mean_token_accuracy": 0.04190313443541527, + "num_tokens": 198919164.0, + "step": 391 + }, + { + "epoch": 0.10600324499729584, + "grad_norm": 7.53125, + "learning_rate": 0.014090090090090091, + "loss": 4.0015, + "mean_token_accuracy": 0.33613038063049316, + "num_tokens": 199443442.0, + "step": 392 + }, + { + "epoch": 0.10627366143861547, + "grad_norm": 9.4375, + "learning_rate": 0.014126126126126126, + "loss": 3.5022, + "mean_token_accuracy": 0.3572489023208618, + "num_tokens": 199967515.0, + "step": 393 + }, + { + "epoch": 0.1065440778799351, + "grad_norm": 4.6875, + "learning_rate": 0.014162162162162163, + "loss": 3.4839, + "mean_token_accuracy": 0.3577330708503723, + "num_tokens": 200491047.0, + "step": 394 + }, + { + "epoch": 0.10681449432125473, + "grad_norm": 5.90625, + "learning_rate": 0.014198198198198198, + "loss": 3.5715, + "mean_token_accuracy": 0.3588724136352539, + "num_tokens": 201015228.0, + "step": 395 + }, + { + "epoch": 0.10708491076257437, + "grad_norm": 4.65625, + "learning_rate": 0.014234234234234233, + "loss": 3.4615, + "mean_token_accuracy": 0.36357375979423523, + "num_tokens": 201539357.0, + "step": 396 + }, + { + "epoch": 0.107355327203894, + "grad_norm": 5.34375, + "learning_rate": 0.014270270270270271, + "loss": 3.121, + "mean_token_accuracy": 0.4702547490596771, + "num_tokens": 202063639.0, + "step": 397 + }, + { + "epoch": 0.10762574364521363, + "grad_norm": 2.515625, + "learning_rate": 0.014306306306306306, + "loss": 3.2955, + "mean_token_accuracy": 0.39359432458877563, + "num_tokens": 202587701.0, + "step": 398 + }, + { + "epoch": 0.10789616008653326, + "grad_norm": 4.375, + "learning_rate": 0.014342342342342343, + "loss": 3.3202, + "mean_token_accuracy": 0.38569241762161255, + "num_tokens": 203088357.0, + "step": 399 + }, + { + "epoch": 0.10816657652785289, + "grad_norm": 4.21875, + "learning_rate": 0.014378378378378378, + "loss": 3.1093, + "mean_token_accuracy": 0.39163029193878174, + "num_tokens": 203612484.0, + "step": 400 + }, + { + "epoch": 0.10843699296917253, + "grad_norm": 3.703125, + "learning_rate": 0.014414414414414415, + "loss": 3.48, + "mean_token_accuracy": 0.3762412369251251, + "num_tokens": 204136753.0, + "step": 401 + }, + { + "epoch": 0.10870740941049216, + "grad_norm": 3.734375, + "learning_rate": 0.01445045045045045, + "loss": 3.473, + "mean_token_accuracy": 0.3872537612915039, + "num_tokens": 204568560.0, + "step": 402 + }, + { + "epoch": 0.10897782585181179, + "grad_norm": 3.25, + "learning_rate": 0.014486486486486488, + "loss": 3.2216, + "mean_token_accuracy": 0.42583492398262024, + "num_tokens": 205031094.0, + "step": 403 + }, + { + "epoch": 0.10924824229313142, + "grad_norm": 4.0625, + "learning_rate": 0.014522522522522523, + "loss": 3.1561, + "mean_token_accuracy": 0.38919416069984436, + "num_tokens": 205555314.0, + "step": 404 + }, + { + "epoch": 0.10951865873445106, + "grad_norm": 3.578125, + "learning_rate": 0.01455855855855856, + "loss": 3.3383, + "mean_token_accuracy": 0.40418344736099243, + "num_tokens": 206045747.0, + "step": 405 + }, + { + "epoch": 0.10978907517577069, + "grad_norm": 4.71875, + "learning_rate": 0.014594594594594595, + "loss": 3.3348, + "mean_token_accuracy": 0.3756635785102844, + "num_tokens": 206564284.0, + "step": 406 + }, + { + "epoch": 0.11005949161709032, + "grad_norm": 3.375, + "learning_rate": 0.01463063063063063, + "loss": 3.0227, + "mean_token_accuracy": 0.40409114956855774, + "num_tokens": 207068821.0, + "step": 407 + }, + { + "epoch": 0.11032990805840995, + "grad_norm": 3.484375, + "learning_rate": 0.014666666666666666, + "loss": 3.3525, + "mean_token_accuracy": 0.3987481892108917, + "num_tokens": 207592890.0, + "step": 408 + }, + { + "epoch": 0.11060032449972958, + "grad_norm": 3.34375, + "learning_rate": 0.014702702702702703, + "loss": 3.3343, + "mean_token_accuracy": 0.3790501654148102, + "num_tokens": 208116969.0, + "step": 409 + }, + { + "epoch": 0.11087074094104922, + "grad_norm": 3.78125, + "learning_rate": 0.01473873873873874, + "loss": 3.1994, + "mean_token_accuracy": 0.3802141547203064, + "num_tokens": 208641248.0, + "step": 410 + }, + { + "epoch": 0.11114115738236885, + "grad_norm": 2.78125, + "learning_rate": 0.014774774774774775, + "loss": 11.5328, + "mean_token_accuracy": 0.0, + "num_tokens": 209165530.0, + "step": 411 + }, + { + "epoch": 0.11141157382368848, + "grad_norm": 8.125, + "learning_rate": 0.014810810810810811, + "loss": 3.6555, + "mean_token_accuracy": 0.31980228424072266, + "num_tokens": 209689756.0, + "step": 412 + }, + { + "epoch": 0.11168199026500811, + "grad_norm": 2.96875, + "learning_rate": 0.014846846846846846, + "loss": 3.3162, + "mean_token_accuracy": 0.37675002217292786, + "num_tokens": 210168984.0, + "step": 413 + }, + { + "epoch": 0.11195240670632775, + "grad_norm": 3.171875, + "learning_rate": 0.014882882882882881, + "loss": 3.0798, + "mean_token_accuracy": 0.3972213566303253, + "num_tokens": 210693189.0, + "step": 414 + }, + { + "epoch": 0.11222282314764738, + "grad_norm": 5.3125, + "learning_rate": 0.01491891891891892, + "loss": 3.4828, + "mean_token_accuracy": 0.37634778022766113, + "num_tokens": 211217406.0, + "step": 415 + }, + { + "epoch": 0.11249323958896701, + "grad_norm": 4.53125, + "learning_rate": 0.014954954954954955, + "loss": 3.4528, + "mean_token_accuracy": 0.3632332980632782, + "num_tokens": 211710689.0, + "step": 416 + }, + { + "epoch": 0.11276365603028664, + "grad_norm": 3.453125, + "learning_rate": 0.014990990990990992, + "loss": 3.2387, + "mean_token_accuracy": 0.3899117112159729, + "num_tokens": 212234826.0, + "step": 417 + }, + { + "epoch": 0.11303407247160627, + "grad_norm": 3.4375, + "learning_rate": 0.015027027027027027, + "loss": 3.5759, + "mean_token_accuracy": 0.34738028049468994, + "num_tokens": 212759092.0, + "step": 418 + }, + { + "epoch": 0.11330448891292591, + "grad_norm": 3.03125, + "learning_rate": 0.015063063063063063, + "loss": 3.1252, + "mean_token_accuracy": 0.40234044194221497, + "num_tokens": 213283142.0, + "step": 419 + }, + { + "epoch": 0.11357490535424554, + "grad_norm": 4.0625, + "learning_rate": 0.015099099099099098, + "loss": 3.3581, + "mean_token_accuracy": 0.39009249210357666, + "num_tokens": 213807344.0, + "step": 420 + }, + { + "epoch": 0.11384532179556517, + "grad_norm": 3.4375, + "learning_rate": 0.015135135135135137, + "loss": 3.2676, + "mean_token_accuracy": 0.3786548376083374, + "num_tokens": 214331509.0, + "step": 421 + }, + { + "epoch": 0.1141157382368848, + "grad_norm": 3.296875, + "learning_rate": 0.015171171171171172, + "loss": 3.4339, + "mean_token_accuracy": 0.36693859100341797, + "num_tokens": 214855745.0, + "step": 422 + }, + { + "epoch": 0.11438615467820444, + "grad_norm": 3.390625, + "learning_rate": 0.015207207207207208, + "loss": 3.4964, + "mean_token_accuracy": 0.3588467836380005, + "num_tokens": 215343556.0, + "step": 423 + }, + { + "epoch": 0.11465657111952407, + "grad_norm": 3.109375, + "learning_rate": 0.015243243243243243, + "loss": 3.032, + "mean_token_accuracy": 0.41700392961502075, + "num_tokens": 215867804.0, + "step": 424 + }, + { + "epoch": 0.1149269875608437, + "grad_norm": 3.234375, + "learning_rate": 0.015279279279279278, + "loss": 3.4439, + "mean_token_accuracy": 0.3676285743713379, + "num_tokens": 216372644.0, + "step": 425 + }, + { + "epoch": 0.11519740400216333, + "grad_norm": 4.625, + "learning_rate": 0.015315315315315315, + "loss": 3.3698, + "mean_token_accuracy": 0.3693413734436035, + "num_tokens": 216896903.0, + "step": 426 + }, + { + "epoch": 0.11546782044348296, + "grad_norm": 3.21875, + "learning_rate": 0.015351351351351352, + "loss": 3.2922, + "mean_token_accuracy": 0.38975584506988525, + "num_tokens": 217421162.0, + "step": 427 + }, + { + "epoch": 0.1157382368848026, + "grad_norm": 3.890625, + "learning_rate": 0.015387387387387388, + "loss": 3.1951, + "mean_token_accuracy": 0.37634289264678955, + "num_tokens": 217945432.0, + "step": 428 + }, + { + "epoch": 0.11600865332612223, + "grad_norm": 5.1875, + "learning_rate": 0.015423423423423423, + "loss": 3.4472, + "mean_token_accuracy": 0.3688328266143799, + "num_tokens": 218469640.0, + "step": 429 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 5.1875, + "learning_rate": 0.01545945945945946, + "loss": 3.4785, + "mean_token_accuracy": 0.36875927448272705, + "num_tokens": 218993844.0, + "step": 430 + }, + { + "epoch": 0.11654948620876149, + "grad_norm": 2.109375, + "learning_rate": 0.015495495495495495, + "loss": 10.1981, + "mean_token_accuracy": 2.575499274826143e-05, + "num_tokens": 219508946.0, + "step": 431 + }, + { + "epoch": 0.11681990265008113, + "grad_norm": 8.6875, + "learning_rate": 0.01553153153153153, + "loss": 3.916, + "mean_token_accuracy": 0.33324694633483887, + "num_tokens": 220033111.0, + "step": 432 + }, + { + "epoch": 0.11709031909140076, + "grad_norm": 2.9375, + "learning_rate": 0.015567567567567569, + "loss": 3.3932, + "mean_token_accuracy": 0.3811063766479492, + "num_tokens": 220557003.0, + "step": 433 + }, + { + "epoch": 0.11736073553272039, + "grad_norm": 5.40625, + "learning_rate": 0.015603603603603605, + "loss": 3.556, + "mean_token_accuracy": 0.34562551975250244, + "num_tokens": 221081261.0, + "step": 434 + }, + { + "epoch": 0.11763115197404002, + "grad_norm": 3.0625, + "learning_rate": 0.01563963963963964, + "loss": 3.2608, + "mean_token_accuracy": 0.37198883295059204, + "num_tokens": 221605543.0, + "step": 435 + }, + { + "epoch": 0.11790156841535965, + "grad_norm": 4.15625, + "learning_rate": 0.015675675675675675, + "loss": 3.2371, + "mean_token_accuracy": 0.364715039730072, + "num_tokens": 222086494.0, + "step": 436 + }, + { + "epoch": 0.11817198485667929, + "grad_norm": 2.921875, + "learning_rate": 0.01571171171171171, + "loss": 3.3968, + "mean_token_accuracy": 0.37861377000808716, + "num_tokens": 222610727.0, + "step": 437 + }, + { + "epoch": 0.11844240129799892, + "grad_norm": 5.53125, + "learning_rate": 0.01574774774774775, + "loss": 3.5626, + "mean_token_accuracy": 0.3429536819458008, + "num_tokens": 223130205.0, + "step": 438 + }, + { + "epoch": 0.11871281773931855, + "grad_norm": 2.640625, + "learning_rate": 0.015783783783783784, + "loss": 3.1137, + "mean_token_accuracy": 0.3713279962539673, + "num_tokens": 223648948.0, + "step": 439 + }, + { + "epoch": 0.11898323418063818, + "grad_norm": 3.90625, + "learning_rate": 0.015819819819819822, + "loss": 3.4577, + "mean_token_accuracy": 0.35429635643959045, + "num_tokens": 224173230.0, + "step": 440 + }, + { + "epoch": 0.11925365062195782, + "grad_norm": 3.875, + "learning_rate": 0.015855855855855857, + "loss": 3.3934, + "mean_token_accuracy": 0.3797808885574341, + "num_tokens": 224697334.0, + "step": 441 + }, + { + "epoch": 0.11952406706327745, + "grad_norm": 3.84375, + "learning_rate": 0.015891891891891892, + "loss": 3.5791, + "mean_token_accuracy": 0.3724026679992676, + "num_tokens": 225221549.0, + "step": 442 + }, + { + "epoch": 0.11979448350459708, + "grad_norm": 3.296875, + "learning_rate": 0.015927927927927927, + "loss": 3.1808, + "mean_token_accuracy": 0.39386630058288574, + "num_tokens": 225745729.0, + "step": 443 + }, + { + "epoch": 0.12006489994591671, + "grad_norm": 3.328125, + "learning_rate": 0.015963963963963962, + "loss": 3.4394, + "mean_token_accuracy": 0.3790827989578247, + "num_tokens": 226252053.0, + "step": 444 + }, + { + "epoch": 0.12033531638723634, + "grad_norm": 3.390625, + "learning_rate": 0.016, + "loss": 3.4335, + "mean_token_accuracy": 0.37350600957870483, + "num_tokens": 226729878.0, + "step": 445 + }, + { + "epoch": 0.12060573282855598, + "grad_norm": 3.46875, + "learning_rate": 0.016036036036036035, + "loss": 3.3959, + "mean_token_accuracy": 0.35825783014297485, + "num_tokens": 227231111.0, + "step": 446 + }, + { + "epoch": 0.12087614926987561, + "grad_norm": 3.078125, + "learning_rate": 0.016072072072072074, + "loss": 3.3871, + "mean_token_accuracy": 0.3834838569164276, + "num_tokens": 227755343.0, + "step": 447 + }, + { + "epoch": 0.12114656571119524, + "grad_norm": 4.375, + "learning_rate": 0.01610810810810811, + "loss": 3.3497, + "mean_token_accuracy": 0.3654416501522064, + "num_tokens": 228279531.0, + "step": 448 + }, + { + "epoch": 0.12141698215251487, + "grad_norm": 4.0, + "learning_rate": 0.016144144144144144, + "loss": 3.1891, + "mean_token_accuracy": 0.36128997802734375, + "num_tokens": 228803612.0, + "step": 449 + }, + { + "epoch": 0.12168739859383451, + "grad_norm": 2.78125, + "learning_rate": 0.01618018018018018, + "loss": 3.3138, + "mean_token_accuracy": 0.37255051732063293, + "num_tokens": 229327883.0, + "step": 450 + }, + { + "epoch": 0.12195781503515414, + "grad_norm": 2.453125, + "learning_rate": 0.016216216216216217, + "loss": 9.449, + "mean_token_accuracy": 0.010673320852220058, + "num_tokens": 229852036.0, + "step": 451 + }, + { + "epoch": 0.12222823147647377, + "grad_norm": 6.5, + "learning_rate": 0.016252252252252252, + "loss": 3.8933, + "mean_token_accuracy": 0.2928033471107483, + "num_tokens": 230376222.0, + "step": 452 + }, + { + "epoch": 0.1224986479177934, + "grad_norm": 1.8046875, + "learning_rate": 0.016288288288288287, + "loss": 3.4787, + "mean_token_accuracy": 0.3867323398590088, + "num_tokens": 230861559.0, + "step": 453 + }, + { + "epoch": 0.12276906435911303, + "grad_norm": 3.109375, + "learning_rate": 0.016324324324324326, + "loss": 3.1988, + "mean_token_accuracy": 0.3876558542251587, + "num_tokens": 231364583.0, + "step": 454 + }, + { + "epoch": 0.12303948080043267, + "grad_norm": 4.75, + "learning_rate": 0.01636036036036036, + "loss": 3.5037, + "mean_token_accuracy": 0.34131675958633423, + "num_tokens": 231888716.0, + "step": 455 + }, + { + "epoch": 0.1233098972417523, + "grad_norm": 3.59375, + "learning_rate": 0.016396396396396395, + "loss": 3.5008, + "mean_token_accuracy": 0.35605162382125854, + "num_tokens": 232412817.0, + "step": 456 + }, + { + "epoch": 0.12358031368307193, + "grad_norm": 3.171875, + "learning_rate": 0.016432432432432434, + "loss": 3.3528, + "mean_token_accuracy": 0.3686584234237671, + "num_tokens": 232937035.0, + "step": 457 + }, + { + "epoch": 0.12385073012439156, + "grad_norm": 3.21875, + "learning_rate": 0.01646846846846847, + "loss": 3.4214, + "mean_token_accuracy": 0.3618983030319214, + "num_tokens": 233461249.0, + "step": 458 + }, + { + "epoch": 0.1241211465657112, + "grad_norm": 3.65625, + "learning_rate": 0.016504504504504504, + "loss": 3.2895, + "mean_token_accuracy": 0.36981695890426636, + "num_tokens": 233985455.0, + "step": 459 + }, + { + "epoch": 0.12439156300703083, + "grad_norm": 4.0625, + "learning_rate": 0.016540540540540542, + "loss": 2.9658, + "mean_token_accuracy": 0.42121291160583496, + "num_tokens": 234509677.0, + "step": 460 + }, + { + "epoch": 0.12466197944835046, + "grad_norm": 3.53125, + "learning_rate": 0.016576576576576577, + "loss": 3.3296, + "mean_token_accuracy": 0.36430200934410095, + "num_tokens": 235033919.0, + "step": 461 + }, + { + "epoch": 0.12493239588967009, + "grad_norm": 3.96875, + "learning_rate": 0.016612612612612612, + "loss": 3.4768, + "mean_token_accuracy": 0.361985445022583, + "num_tokens": 235558124.0, + "step": 462 + }, + { + "epoch": 0.12520281233098973, + "grad_norm": 4.6875, + "learning_rate": 0.01664864864864865, + "loss": 3.5429, + "mean_token_accuracy": 0.3757792115211487, + "num_tokens": 236082380.0, + "step": 463 + }, + { + "epoch": 0.12547322877230935, + "grad_norm": 3.921875, + "learning_rate": 0.016684684684684686, + "loss": 3.4277, + "mean_token_accuracy": 0.3514975309371948, + "num_tokens": 236606659.0, + "step": 464 + }, + { + "epoch": 0.125743645213629, + "grad_norm": 2.640625, + "learning_rate": 0.01672072072072072, + "loss": 3.2712, + "mean_token_accuracy": 0.3849625289440155, + "num_tokens": 237130898.0, + "step": 465 + }, + { + "epoch": 0.12601406165494863, + "grad_norm": 3.71875, + "learning_rate": 0.016756756756756756, + "loss": 3.2455, + "mean_token_accuracy": 0.37170690298080444, + "num_tokens": 237655037.0, + "step": 466 + }, + { + "epoch": 0.12628447809626825, + "grad_norm": 3.765625, + "learning_rate": 0.016792792792792794, + "loss": 3.4055, + "mean_token_accuracy": 0.34731632471084595, + "num_tokens": 238179243.0, + "step": 467 + }, + { + "epoch": 0.1265548945375879, + "grad_norm": 3.921875, + "learning_rate": 0.01682882882882883, + "loss": 3.5161, + "mean_token_accuracy": 0.36786240339279175, + "num_tokens": 238703421.0, + "step": 468 + }, + { + "epoch": 0.1268253109789075, + "grad_norm": 3.140625, + "learning_rate": 0.016864864864864867, + "loss": 3.4026, + "mean_token_accuracy": 0.3790878653526306, + "num_tokens": 239227681.0, + "step": 469 + }, + { + "epoch": 0.12709572742022715, + "grad_norm": 3.734375, + "learning_rate": 0.016900900900900902, + "loss": 3.3797, + "mean_token_accuracy": 0.3639812469482422, + "num_tokens": 239751805.0, + "step": 470 + }, + { + "epoch": 0.1273661438615468, + "grad_norm": 1.34375, + "learning_rate": 0.016936936936936937, + "loss": 10.7392, + "mean_token_accuracy": 7.316417395486496e-06, + "num_tokens": 240276080.0, + "step": 471 + }, + { + "epoch": 0.1276365603028664, + "grad_norm": 12.125, + "learning_rate": 0.016972972972972972, + "loss": 3.9804, + "mean_token_accuracy": 0.2765960693359375, + "num_tokens": 240800315.0, + "step": 472 + }, + { + "epoch": 0.12790697674418605, + "grad_norm": 2.578125, + "learning_rate": 0.017009009009009007, + "loss": 3.4134, + "mean_token_accuracy": 0.35206228494644165, + "num_tokens": 241299801.0, + "step": 473 + }, + { + "epoch": 0.12817739318550567, + "grad_norm": 2.265625, + "learning_rate": 0.017045045045045046, + "loss": 3.5081, + "mean_token_accuracy": 0.3742418885231018, + "num_tokens": 241823961.0, + "step": 474 + }, + { + "epoch": 0.1284478096268253, + "grad_norm": 4.1875, + "learning_rate": 0.01708108108108108, + "loss": 3.3674, + "mean_token_accuracy": 0.3595058023929596, + "num_tokens": 242290617.0, + "step": 475 + }, + { + "epoch": 0.12871822606814495, + "grad_norm": 3.453125, + "learning_rate": 0.01711711711711712, + "loss": 3.418, + "mean_token_accuracy": 0.3785686790943146, + "num_tokens": 242814772.0, + "step": 476 + }, + { + "epoch": 0.12898864250946457, + "grad_norm": 3.71875, + "learning_rate": 0.017153153153153154, + "loss": 3.4697, + "mean_token_accuracy": 0.36512845754623413, + "num_tokens": 243281628.0, + "step": 477 + }, + { + "epoch": 0.1292590589507842, + "grad_norm": 4.03125, + "learning_rate": 0.01718918918918919, + "loss": 3.6214, + "mean_token_accuracy": 0.3653690218925476, + "num_tokens": 243805889.0, + "step": 478 + }, + { + "epoch": 0.12952947539210383, + "grad_norm": 4.0, + "learning_rate": 0.017225225225225224, + "loss": 3.4985, + "mean_token_accuracy": 0.3608347475528717, + "num_tokens": 244330153.0, + "step": 479 + }, + { + "epoch": 0.12979989183342347, + "grad_norm": 4.125, + "learning_rate": 0.01726126126126126, + "loss": 3.5834, + "mean_token_accuracy": 0.37549784779548645, + "num_tokens": 244801005.0, + "step": 480 + }, + { + "epoch": 0.1300703082747431, + "grad_norm": 3.875, + "learning_rate": 0.017297297297297298, + "loss": 3.3437, + "mean_token_accuracy": 0.3918059170246124, + "num_tokens": 245325244.0, + "step": 481 + }, + { + "epoch": 0.13034072471606273, + "grad_norm": 3.8125, + "learning_rate": 0.017333333333333336, + "loss": 3.6069, + "mean_token_accuracy": 0.32765740156173706, + "num_tokens": 245849427.0, + "step": 482 + }, + { + "epoch": 0.13061114115738237, + "grad_norm": 3.140625, + "learning_rate": 0.01736936936936937, + "loss": 3.4298, + "mean_token_accuracy": 0.3794148862361908, + "num_tokens": 246373701.0, + "step": 483 + }, + { + "epoch": 0.13088155759870201, + "grad_norm": 3.03125, + "learning_rate": 0.017405405405405406, + "loss": 3.6572, + "mean_token_accuracy": 0.3555411100387573, + "num_tokens": 246846816.0, + "step": 484 + }, + { + "epoch": 0.13115197404002163, + "grad_norm": 3.046875, + "learning_rate": 0.01744144144144144, + "loss": 3.384, + "mean_token_accuracy": 0.3873428702354431, + "num_tokens": 247346206.0, + "step": 485 + }, + { + "epoch": 0.13142239048134127, + "grad_norm": 3.296875, + "learning_rate": 0.017477477477477476, + "loss": 3.2705, + "mean_token_accuracy": 0.3812018632888794, + "num_tokens": 247870468.0, + "step": 486 + }, + { + "epoch": 0.1316928069226609, + "grad_norm": 3.03125, + "learning_rate": 0.017513513513513514, + "loss": 3.1259, + "mean_token_accuracy": 0.4063922166824341, + "num_tokens": 248394580.0, + "step": 487 + }, + { + "epoch": 0.13196322336398053, + "grad_norm": 3.125, + "learning_rate": 0.01754954954954955, + "loss": 3.158, + "mean_token_accuracy": 0.39620256423950195, + "num_tokens": 248858860.0, + "step": 488 + }, + { + "epoch": 0.13223363980530017, + "grad_norm": 3.734375, + "learning_rate": 0.017585585585585588, + "loss": 3.3602, + "mean_token_accuracy": 0.3399791121482849, + "num_tokens": 249383113.0, + "step": 489 + }, + { + "epoch": 0.1325040562466198, + "grad_norm": 15.0625, + "learning_rate": 0.017621621621621623, + "loss": 3.2378, + "mean_token_accuracy": 0.39492684602737427, + "num_tokens": 249907228.0, + "step": 490 + }, + { + "epoch": 0.13277447268793943, + "grad_norm": 4.65625, + "learning_rate": 0.017657657657657658, + "loss": 10.1265, + "mean_token_accuracy": 0.00030820141546428204, + "num_tokens": 250388054.0, + "step": 491 + }, + { + "epoch": 0.13304488912925905, + "grad_norm": 6.0, + "learning_rate": 0.017693693693693693, + "loss": 3.8614, + "mean_token_accuracy": 0.310328871011734, + "num_tokens": 250899195.0, + "step": 492 + }, + { + "epoch": 0.1333153055705787, + "grad_norm": 4.0625, + "learning_rate": 0.01772972972972973, + "loss": 3.5337, + "mean_token_accuracy": 0.350527822971344, + "num_tokens": 251423327.0, + "step": 493 + }, + { + "epoch": 0.13358572201189833, + "grad_norm": 4.5, + "learning_rate": 0.017765765765765766, + "loss": 3.4785, + "mean_token_accuracy": 0.399026483297348, + "num_tokens": 251872966.0, + "step": 494 + }, + { + "epoch": 0.13385613845321795, + "grad_norm": 3.421875, + "learning_rate": 0.0178018018018018, + "loss": 3.5914, + "mean_token_accuracy": 0.33834439516067505, + "num_tokens": 252397228.0, + "step": 495 + }, + { + "epoch": 0.1341265548945376, + "grad_norm": 4.21875, + "learning_rate": 0.01783783783783784, + "loss": 3.665, + "mean_token_accuracy": 0.3712007999420166, + "num_tokens": 252921338.0, + "step": 496 + }, + { + "epoch": 0.1343969713358572, + "grad_norm": 2.90625, + "learning_rate": 0.017873873873873874, + "loss": 3.5929, + "mean_token_accuracy": 0.3546021580696106, + "num_tokens": 253391549.0, + "step": 497 + }, + { + "epoch": 0.13466738777717685, + "grad_norm": 3.96875, + "learning_rate": 0.01790990990990991, + "loss": 3.6904, + "mean_token_accuracy": 0.3503772020339966, + "num_tokens": 253915726.0, + "step": 498 + }, + { + "epoch": 0.1349378042184965, + "grad_norm": 3.046875, + "learning_rate": 0.017945945945945948, + "loss": 3.2948, + "mean_token_accuracy": 0.3719966411590576, + "num_tokens": 254382112.0, + "step": 499 + }, + { + "epoch": 0.1352082206598161, + "grad_norm": 2.71875, + "learning_rate": 0.017981981981981983, + "loss": 3.406, + "mean_token_accuracy": 0.3779029846191406, + "num_tokens": 254890489.0, + "step": 500 + }, + { + "epoch": 0.13547863710113575, + "grad_norm": 3.4375, + "learning_rate": 0.018018018018018018, + "loss": 3.4187, + "mean_token_accuracy": 0.35546985268592834, + "num_tokens": 255414709.0, + "step": 501 + }, + { + "epoch": 0.1357490535424554, + "grad_norm": 3.5, + "learning_rate": 0.018054054054054053, + "loss": 3.5057, + "mean_token_accuracy": 0.38429027795791626, + "num_tokens": 255905108.0, + "step": 502 + }, + { + "epoch": 0.136019469983775, + "grad_norm": 3.28125, + "learning_rate": 0.01809009009009009, + "loss": 3.323, + "mean_token_accuracy": 0.35390183329582214, + "num_tokens": 256429347.0, + "step": 503 + }, + { + "epoch": 0.13628988642509465, + "grad_norm": 2.671875, + "learning_rate": 0.018126126126126126, + "loss": 3.2753, + "mean_token_accuracy": 0.4005722403526306, + "num_tokens": 256859421.0, + "step": 504 + }, + { + "epoch": 0.13656030286641427, + "grad_norm": 3.203125, + "learning_rate": 0.018162162162162165, + "loss": 3.4887, + "mean_token_accuracy": 0.36165183782577515, + "num_tokens": 257383698.0, + "step": 505 + }, + { + "epoch": 0.1368307193077339, + "grad_norm": 4.1875, + "learning_rate": 0.0181981981981982, + "loss": 3.3401, + "mean_token_accuracy": 0.37371939420700073, + "num_tokens": 257907977.0, + "step": 506 + }, + { + "epoch": 0.13710113574905355, + "grad_norm": 4.03125, + "learning_rate": 0.018234234234234235, + "loss": 3.3369, + "mean_token_accuracy": 0.37592774629592896, + "num_tokens": 258422083.0, + "step": 507 + }, + { + "epoch": 0.13737155219037317, + "grad_norm": 3.078125, + "learning_rate": 0.01827027027027027, + "loss": 3.5434, + "mean_token_accuracy": 0.3545660972595215, + "num_tokens": 258946344.0, + "step": 508 + }, + { + "epoch": 0.1376419686316928, + "grad_norm": 3.578125, + "learning_rate": 0.018306306306306305, + "loss": 3.3541, + "mean_token_accuracy": 0.3969081938266754, + "num_tokens": 259470568.0, + "step": 509 + }, + { + "epoch": 0.13791238507301243, + "grad_norm": 4.03125, + "learning_rate": 0.018342342342342343, + "loss": 3.2711, + "mean_token_accuracy": 0.39655327796936035, + "num_tokens": 259932412.0, + "step": 510 + }, + { + "epoch": 0.13818280151433207, + "grad_norm": 205.0, + "learning_rate": 0.01837837837837838, + "loss": 22.5755, + "mean_token_accuracy": 8.10977417131653e-06, + "num_tokens": 260417352.0, + "step": 511 + }, + { + "epoch": 0.13845321795565171, + "grad_norm": 7.5625, + "learning_rate": 0.018414414414414416, + "loss": 4.2042, + "mean_token_accuracy": 0.3246574401855469, + "num_tokens": 260904017.0, + "step": 512 + }, + { + "epoch": 0.13872363439697133, + "grad_norm": 2.46875, + "learning_rate": 0.01845045045045045, + "loss": 3.5722, + "mean_token_accuracy": 0.3734402060508728, + "num_tokens": 261381824.0, + "step": 513 + }, + { + "epoch": 0.13899405083829097, + "grad_norm": 4.65625, + "learning_rate": 0.018486486486486486, + "loss": 3.5869, + "mean_token_accuracy": 0.3515256941318512, + "num_tokens": 261906012.0, + "step": 514 + }, + { + "epoch": 0.1392644672796106, + "grad_norm": 3.65625, + "learning_rate": 0.01852252252252252, + "loss": 3.5543, + "mean_token_accuracy": 0.34860581159591675, + "num_tokens": 262430275.0, + "step": 515 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 3.1875, + "learning_rate": 0.01855855855855856, + "loss": 3.4128, + "mean_token_accuracy": 0.3753540515899658, + "num_tokens": 262910007.0, + "step": 516 + }, + { + "epoch": 0.13980530016224987, + "grad_norm": 3.1875, + "learning_rate": 0.018594594594594595, + "loss": 3.5616, + "mean_token_accuracy": 0.3793222904205322, + "num_tokens": 263370918.0, + "step": 517 + }, + { + "epoch": 0.1400757166035695, + "grad_norm": 3.5, + "learning_rate": 0.018630630630630633, + "loss": 3.4888, + "mean_token_accuracy": 0.38175860047340393, + "num_tokens": 263875838.0, + "step": 518 + }, + { + "epoch": 0.14034613304488913, + "grad_norm": 3.046875, + "learning_rate": 0.018666666666666668, + "loss": 3.3388, + "mean_token_accuracy": 0.36726218461990356, + "num_tokens": 264399912.0, + "step": 519 + }, + { + "epoch": 0.14061654948620878, + "grad_norm": 2.734375, + "learning_rate": 0.018702702702702703, + "loss": 3.3145, + "mean_token_accuracy": 0.38587486743927, + "num_tokens": 264907507.0, + "step": 520 + }, + { + "epoch": 0.1408869659275284, + "grad_norm": 3.328125, + "learning_rate": 0.018738738738738738, + "loss": 3.2991, + "mean_token_accuracy": 0.3828248381614685, + "num_tokens": 265419188.0, + "step": 521 + }, + { + "epoch": 0.14115738236884803, + "grad_norm": 3.8125, + "learning_rate": 0.018774774774774773, + "loss": 3.4879, + "mean_token_accuracy": 0.34920018911361694, + "num_tokens": 265943434.0, + "step": 522 + }, + { + "epoch": 0.14142779881016765, + "grad_norm": 2.9375, + "learning_rate": 0.01881081081081081, + "loss": 3.4318, + "mean_token_accuracy": 0.37612223625183105, + "num_tokens": 266467589.0, + "step": 523 + }, + { + "epoch": 0.1416982152514873, + "grad_norm": 3.78125, + "learning_rate": 0.018846846846846847, + "loss": 3.3674, + "mean_token_accuracy": 0.3382633924484253, + "num_tokens": 266991630.0, + "step": 524 + }, + { + "epoch": 0.14196863169280693, + "grad_norm": 2.671875, + "learning_rate": 0.018882882882882885, + "loss": 3.0473, + "mean_token_accuracy": 0.3889750838279724, + "num_tokens": 267515863.0, + "step": 525 + }, + { + "epoch": 0.14223904813412655, + "grad_norm": 2.75, + "learning_rate": 0.01891891891891892, + "loss": 3.4897, + "mean_token_accuracy": 0.3691534996032715, + "num_tokens": 268021312.0, + "step": 526 + }, + { + "epoch": 0.1425094645754462, + "grad_norm": 3.640625, + "learning_rate": 0.018954954954954955, + "loss": 3.357, + "mean_token_accuracy": 0.3722463846206665, + "num_tokens": 268545439.0, + "step": 527 + }, + { + "epoch": 0.1427798810167658, + "grad_norm": 3.375, + "learning_rate": 0.01899099099099099, + "loss": 3.2019, + "mean_token_accuracy": 0.36229681968688965, + "num_tokens": 269069492.0, + "step": 528 + }, + { + "epoch": 0.14305029745808545, + "grad_norm": 2.578125, + "learning_rate": 0.01902702702702703, + "loss": 3.3219, + "mean_token_accuracy": 0.37566617131233215, + "num_tokens": 269593671.0, + "step": 529 + }, + { + "epoch": 0.1433207138994051, + "grad_norm": 3.375, + "learning_rate": 0.019063063063063063, + "loss": 3.3206, + "mean_token_accuracy": 0.373921275138855, + "num_tokens": 270117891.0, + "step": 530 + }, + { + "epoch": 0.1435911303407247, + "grad_norm": 1.9453125, + "learning_rate": 0.0190990990990991, + "loss": 10.6805, + "mean_token_accuracy": 0.0, + "num_tokens": 270620055.0, + "step": 531 + }, + { + "epoch": 0.14386154678204435, + "grad_norm": 7.28125, + "learning_rate": 0.019135135135135137, + "loss": 3.8406, + "mean_token_accuracy": 0.32247796654701233, + "num_tokens": 271144309.0, + "step": 532 + }, + { + "epoch": 0.14413196322336397, + "grad_norm": 2.46875, + "learning_rate": 0.01917117117117117, + "loss": 3.7914, + "mean_token_accuracy": 0.4026600122451782, + "num_tokens": 271503735.0, + "step": 533 + }, + { + "epoch": 0.1444023796646836, + "grad_norm": 3.328125, + "learning_rate": 0.019207207207207207, + "loss": 3.8354, + "mean_token_accuracy": 0.31876271963119507, + "num_tokens": 272027934.0, + "step": 534 + }, + { + "epoch": 0.14467279610600325, + "grad_norm": 3.171875, + "learning_rate": 0.019243243243243245, + "loss": 3.5309, + "mean_token_accuracy": 0.3578000068664551, + "num_tokens": 272552163.0, + "step": 535 + }, + { + "epoch": 0.14494321254732287, + "grad_norm": 3.859375, + "learning_rate": 0.01927927927927928, + "loss": 3.5623, + "mean_token_accuracy": 0.3240276277065277, + "num_tokens": 273076416.0, + "step": 536 + }, + { + "epoch": 0.1452136289886425, + "grad_norm": 4.03125, + "learning_rate": 0.019315315315315315, + "loss": 3.4559, + "mean_token_accuracy": 0.38272762298583984, + "num_tokens": 273572626.0, + "step": 537 + }, + { + "epoch": 0.14548404542996216, + "grad_norm": 3.53125, + "learning_rate": 0.01935135135135135, + "loss": 3.5021, + "mean_token_accuracy": 0.3615418076515198, + "num_tokens": 274091281.0, + "step": 538 + }, + { + "epoch": 0.14575446187128177, + "grad_norm": 3.21875, + "learning_rate": 0.01938738738738739, + "loss": 3.3748, + "mean_token_accuracy": 0.3895407021045685, + "num_tokens": 274615512.0, + "step": 539 + }, + { + "epoch": 0.1460248783126014, + "grad_norm": 2.703125, + "learning_rate": 0.019423423423423423, + "loss": 3.3097, + "mean_token_accuracy": 0.3750055432319641, + "num_tokens": 275139760.0, + "step": 540 + }, + { + "epoch": 0.14629529475392103, + "grad_norm": 4.0, + "learning_rate": 0.019459459459459462, + "loss": 3.5566, + "mean_token_accuracy": 0.37413474917411804, + "num_tokens": 275658369.0, + "step": 541 + }, + { + "epoch": 0.14656571119524067, + "grad_norm": 5.375, + "learning_rate": 0.019495495495495497, + "loss": 3.8806, + "mean_token_accuracy": 0.3221978545188904, + "num_tokens": 276182628.0, + "step": 542 + }, + { + "epoch": 0.14683612763656032, + "grad_norm": 2.875, + "learning_rate": 0.019531531531531532, + "loss": 3.6535, + "mean_token_accuracy": 0.32684803009033203, + "num_tokens": 276706818.0, + "step": 543 + }, + { + "epoch": 0.14710654407787993, + "grad_norm": 3.203125, + "learning_rate": 0.019567567567567567, + "loss": 3.6101, + "mean_token_accuracy": 0.34379521012306213, + "num_tokens": 277231047.0, + "step": 544 + }, + { + "epoch": 0.14737696051919957, + "grad_norm": 3.515625, + "learning_rate": 0.019603603603603605, + "loss": 3.1857, + "mean_token_accuracy": 0.4109957218170166, + "num_tokens": 277691530.0, + "step": 545 + }, + { + "epoch": 0.1476473769605192, + "grad_norm": 3.953125, + "learning_rate": 0.01963963963963964, + "loss": 3.4266, + "mean_token_accuracy": 0.3519592881202698, + "num_tokens": 278215808.0, + "step": 546 + }, + { + "epoch": 0.14791779340183883, + "grad_norm": 2.625, + "learning_rate": 0.01967567567567568, + "loss": 3.4889, + "mean_token_accuracy": 0.36854130029678345, + "num_tokens": 278740055.0, + "step": 547 + }, + { + "epoch": 0.14818820984315847, + "grad_norm": 3.265625, + "learning_rate": 0.019711711711711714, + "loss": 3.5104, + "mean_token_accuracy": 0.340639591217041, + "num_tokens": 279264327.0, + "step": 548 + }, + { + "epoch": 0.1484586262844781, + "grad_norm": 2.78125, + "learning_rate": 0.01974774774774775, + "loss": 3.3333, + "mean_token_accuracy": 0.3800773322582245, + "num_tokens": 279788468.0, + "step": 549 + }, + { + "epoch": 0.14872904272579773, + "grad_norm": 2.515625, + "learning_rate": 0.019783783783783784, + "loss": 3.296, + "mean_token_accuracy": 0.3680174946784973, + "num_tokens": 280312675.0, + "step": 550 + }, + { + "epoch": 0.14899945916711735, + "grad_norm": 15.8125, + "learning_rate": 0.01981981981981982, + "loss": 14.351, + "mean_token_accuracy": 0.010890107601881027, + "num_tokens": 280784705.0, + "step": 551 + }, + { + "epoch": 0.149269875608437, + "grad_norm": 9.3125, + "learning_rate": 0.019855855855855857, + "loss": 3.9757, + "mean_token_accuracy": 0.29583653807640076, + "num_tokens": 281308923.0, + "step": 552 + }, + { + "epoch": 0.14954029204975663, + "grad_norm": 3.15625, + "learning_rate": 0.019891891891891892, + "loss": 3.627, + "mean_token_accuracy": 0.35536977648735046, + "num_tokens": 281833107.0, + "step": 553 + }, + { + "epoch": 0.14981070849107625, + "grad_norm": 3.375, + "learning_rate": 0.01992792792792793, + "loss": 3.5117, + "mean_token_accuracy": 0.36778706312179565, + "num_tokens": 282357377.0, + "step": 554 + }, + { + "epoch": 0.1500811249323959, + "grad_norm": 4.0625, + "learning_rate": 0.019963963963963965, + "loss": 3.6233, + "mean_token_accuracy": 0.35591116547584534, + "num_tokens": 282813511.0, + "step": 555 + }, + { + "epoch": 0.1503515413737155, + "grad_norm": 2.84375, + "learning_rate": 0.02, + "loss": 3.5518, + "mean_token_accuracy": 0.36988288164138794, + "num_tokens": 283337769.0, + "step": 556 + }, + { + "epoch": 0.15062195781503515, + "grad_norm": 3.703125, + "learning_rate": 0.019999999861926767, + "loss": 3.3585, + "mean_token_accuracy": 0.3284345269203186, + "num_tokens": 283861940.0, + "step": 557 + }, + { + "epoch": 0.1508923742563548, + "grad_norm": 3.765625, + "learning_rate": 0.01999999944770707, + "loss": 3.4562, + "mean_token_accuracy": 0.36466407775878906, + "num_tokens": 284386105.0, + "step": 558 + }, + { + "epoch": 0.1511627906976744, + "grad_norm": 3.453125, + "learning_rate": 0.019999998757340924, + "loss": 3.5088, + "mean_token_accuracy": 0.39410755038261414, + "num_tokens": 284831870.0, + "step": 559 + }, + { + "epoch": 0.15143320713899405, + "grad_norm": 3.015625, + "learning_rate": 0.019999997790828346, + "loss": 3.4157, + "mean_token_accuracy": 0.3394722640514374, + "num_tokens": 285317932.0, + "step": 560 + }, + { + "epoch": 0.1517036235803137, + "grad_norm": 4.9375, + "learning_rate": 0.01999999654816937, + "loss": 3.6582, + "mean_token_accuracy": 0.3511771559715271, + "num_tokens": 285842018.0, + "step": 561 + }, + { + "epoch": 0.1519740400216333, + "grad_norm": 3.015625, + "learning_rate": 0.01999999502936403, + "loss": 3.0469, + "mean_token_accuracy": 0.37768054008483887, + "num_tokens": 286366205.0, + "step": 562 + }, + { + "epoch": 0.15224445646295295, + "grad_norm": 2.96875, + "learning_rate": 0.019999993234412382, + "loss": 3.4459, + "mean_token_accuracy": 0.37635713815689087, + "num_tokens": 286890419.0, + "step": 563 + }, + { + "epoch": 0.15251487290427257, + "grad_norm": 4.875, + "learning_rate": 0.01999999116331447, + "loss": 3.5617, + "mean_token_accuracy": 0.37022674083709717, + "num_tokens": 287414532.0, + "step": 564 + }, + { + "epoch": 0.1527852893455922, + "grad_norm": 2.8125, + "learning_rate": 0.019999988816070364, + "loss": 3.2896, + "mean_token_accuracy": 0.3776531219482422, + "num_tokens": 287938811.0, + "step": 565 + }, + { + "epoch": 0.15305570578691186, + "grad_norm": 3.28125, + "learning_rate": 0.019999986192680133, + "loss": 3.4882, + "mean_token_accuracy": 0.3739572763442993, + "num_tokens": 288413565.0, + "step": 566 + }, + { + "epoch": 0.15332612222823147, + "grad_norm": 2.90625, + "learning_rate": 0.01999998329314386, + "loss": 3.4626, + "mean_token_accuracy": 0.34775257110595703, + "num_tokens": 288937781.0, + "step": 567 + }, + { + "epoch": 0.1535965386695511, + "grad_norm": 3.53125, + "learning_rate": 0.019999980117461628, + "loss": 3.5641, + "mean_token_accuracy": 0.37144988775253296, + "num_tokens": 289462055.0, + "step": 568 + }, + { + "epoch": 0.15386695511087073, + "grad_norm": 2.9375, + "learning_rate": 0.01999997666563354, + "loss": 3.4646, + "mean_token_accuracy": 0.4205299913883209, + "num_tokens": 289920919.0, + "step": 569 + }, + { + "epoch": 0.15413737155219037, + "grad_norm": 3.109375, + "learning_rate": 0.0199999729376597, + "loss": 3.5545, + "mean_token_accuracy": 0.3626474440097809, + "num_tokens": 290445163.0, + "step": 570 + }, + { + "epoch": 0.15440778799351002, + "grad_norm": 63.5, + "learning_rate": 0.019999968933540224, + "loss": 13.3521, + "mean_token_accuracy": 4.619535047822865e-06, + "num_tokens": 290969354.0, + "step": 571 + }, + { + "epoch": 0.15467820443482963, + "grad_norm": 12.6875, + "learning_rate": 0.019999964653275235, + "loss": 4.4349, + "mean_token_accuracy": 0.25647252798080444, + "num_tokens": 291470750.0, + "step": 572 + }, + { + "epoch": 0.15494862087614927, + "grad_norm": 2.578125, + "learning_rate": 0.019999960096864864, + "loss": 3.599, + "mean_token_accuracy": 0.3266468644142151, + "num_tokens": 291994974.0, + "step": 573 + }, + { + "epoch": 0.1552190373174689, + "grad_norm": 2.703125, + "learning_rate": 0.019999955264309248, + "loss": 3.0703, + "mean_token_accuracy": 0.41672560572624207, + "num_tokens": 292519240.0, + "step": 574 + }, + { + "epoch": 0.15548945375878853, + "grad_norm": 3.0625, + "learning_rate": 0.01999995015560854, + "loss": 3.4052, + "mean_token_accuracy": 0.40159571170806885, + "num_tokens": 292977941.0, + "step": 575 + }, + { + "epoch": 0.15575987020010817, + "grad_norm": 30.0, + "learning_rate": 0.01999994477076289, + "loss": 3.5744, + "mean_token_accuracy": 0.3385022282600403, + "num_tokens": 293502149.0, + "step": 576 + }, + { + "epoch": 0.1560302866414278, + "grad_norm": 5.03125, + "learning_rate": 0.01999993910977247, + "loss": 3.8772, + "mean_token_accuracy": 0.3243117928504944, + "num_tokens": 294026287.0, + "step": 577 + }, + { + "epoch": 0.15630070308274743, + "grad_norm": 2.5625, + "learning_rate": 0.01999993317263745, + "loss": 3.6852, + "mean_token_accuracy": 0.36047759652137756, + "num_tokens": 294548396.0, + "step": 578 + }, + { + "epoch": 0.15657111952406708, + "grad_norm": 4.3125, + "learning_rate": 0.019999926959358014, + "loss": 3.3516, + "mean_token_accuracy": 0.3618052005767822, + "num_tokens": 295072480.0, + "step": 579 + }, + { + "epoch": 0.1568415359653867, + "grad_norm": 2.265625, + "learning_rate": 0.019999920469934353, + "loss": 3.5266, + "mean_token_accuracy": 0.3525379002094269, + "num_tokens": 295596759.0, + "step": 580 + }, + { + "epoch": 0.15711195240670633, + "grad_norm": 5.65625, + "learning_rate": 0.01999991370436666, + "loss": 3.4075, + "mean_token_accuracy": 0.35776248574256897, + "num_tokens": 296121032.0, + "step": 581 + }, + { + "epoch": 0.15738236884802595, + "grad_norm": 3.03125, + "learning_rate": 0.019999906662655154, + "loss": 3.6971, + "mean_token_accuracy": 0.3498789072036743, + "num_tokens": 296645181.0, + "step": 582 + }, + { + "epoch": 0.1576527852893456, + "grad_norm": 3.84375, + "learning_rate": 0.01999989934480004, + "loss": 3.4118, + "mean_token_accuracy": 0.34961843490600586, + "num_tokens": 297169311.0, + "step": 583 + }, + { + "epoch": 0.15792320173066524, + "grad_norm": 3.140625, + "learning_rate": 0.01999989175080155, + "loss": 3.2683, + "mean_token_accuracy": 0.37982240319252014, + "num_tokens": 297693554.0, + "step": 584 + }, + { + "epoch": 0.15819361817198485, + "grad_norm": 24.25, + "learning_rate": 0.019999883880659913, + "loss": 3.9307, + "mean_token_accuracy": 0.30451536178588867, + "num_tokens": 298217834.0, + "step": 585 + }, + { + "epoch": 0.1584640346133045, + "grad_norm": 3.75, + "learning_rate": 0.019999875734375373, + "loss": 3.8945, + "mean_token_accuracy": 0.31562960147857666, + "num_tokens": 298742011.0, + "step": 586 + }, + { + "epoch": 0.1587344510546241, + "grad_norm": 18.375, + "learning_rate": 0.019999867311948173, + "loss": 3.8023, + "mean_token_accuracy": 0.327006995677948, + "num_tokens": 299266232.0, + "step": 587 + }, + { + "epoch": 0.15900486749594375, + "grad_norm": 2.9375, + "learning_rate": 0.019999858613378583, + "loss": 3.6381, + "mean_token_accuracy": 0.33788374066352844, + "num_tokens": 299744693.0, + "step": 588 + }, + { + "epoch": 0.1592752839372634, + "grad_norm": 2.171875, + "learning_rate": 0.019999849638666866, + "loss": 3.5201, + "mean_token_accuracy": 0.3404275178909302, + "num_tokens": 300268819.0, + "step": 589 + }, + { + "epoch": 0.159545700378583, + "grad_norm": 5.8125, + "learning_rate": 0.01999984038781329, + "loss": 3.9121, + "mean_token_accuracy": 0.33490097522735596, + "num_tokens": 300793018.0, + "step": 590 + }, + { + "epoch": 0.15981611681990265, + "grad_norm": 4.40625, + "learning_rate": 0.019999830860818146, + "loss": 11.8994, + "mean_token_accuracy": 0.0, + "num_tokens": 301317222.0, + "step": 591 + }, + { + "epoch": 0.16008653326122227, + "grad_norm": 7.15625, + "learning_rate": 0.01999982105768172, + "loss": 4.296, + "mean_token_accuracy": 0.26888078451156616, + "num_tokens": 301841500.0, + "step": 592 + }, + { + "epoch": 0.1603569497025419, + "grad_norm": 2.671875, + "learning_rate": 0.01999981097840432, + "loss": 3.7154, + "mean_token_accuracy": 0.3391442596912384, + "num_tokens": 302343960.0, + "step": 593 + }, + { + "epoch": 0.16062736614386156, + "grad_norm": 3.78125, + "learning_rate": 0.019999800622986254, + "loss": 3.8206, + "mean_token_accuracy": 0.33571481704711914, + "num_tokens": 302868193.0, + "step": 594 + }, + { + "epoch": 0.16089778258518117, + "grad_norm": 3.90625, + "learning_rate": 0.01999978999142784, + "loss": 3.8046, + "mean_token_accuracy": 0.3199080228805542, + "num_tokens": 303392446.0, + "step": 595 + }, + { + "epoch": 0.1611681990265008, + "grad_norm": 2.703125, + "learning_rate": 0.0199997790837294, + "loss": 3.6492, + "mean_token_accuracy": 0.36413174867630005, + "num_tokens": 303883872.0, + "step": 596 + }, + { + "epoch": 0.16143861546782046, + "grad_norm": 3.21875, + "learning_rate": 0.01999976789989127, + "loss": 3.7598, + "mean_token_accuracy": 0.3199944496154785, + "num_tokens": 304408139.0, + "step": 597 + }, + { + "epoch": 0.16170903190914007, + "grad_norm": 2.265625, + "learning_rate": 0.019999756439913793, + "loss": 3.4544, + "mean_token_accuracy": 0.3572637140750885, + "num_tokens": 304932276.0, + "step": 598 + }, + { + "epoch": 0.16197944835045971, + "grad_norm": 3.265625, + "learning_rate": 0.019999744703797326, + "loss": 3.4239, + "mean_token_accuracy": 0.34242549538612366, + "num_tokens": 305456512.0, + "step": 599 + }, + { + "epoch": 0.16224986479177933, + "grad_norm": 2.5625, + "learning_rate": 0.019999732691542222, + "loss": 3.5046, + "mean_token_accuracy": 0.35899215936660767, + "num_tokens": 305980647.0, + "step": 600 + }, + { + "epoch": 0.16252028123309897, + "grad_norm": 3.203125, + "learning_rate": 0.019999720403148857, + "loss": 3.6556, + "mean_token_accuracy": 0.3614926040172577, + "num_tokens": 306472792.0, + "step": 601 + }, + { + "epoch": 0.16279069767441862, + "grad_norm": 2.0, + "learning_rate": 0.019999707838617597, + "loss": 3.4171, + "mean_token_accuracy": 0.38094890117645264, + "num_tokens": 306987644.0, + "step": 602 + }, + { + "epoch": 0.16306111411573823, + "grad_norm": 3.3125, + "learning_rate": 0.019999694997948837, + "loss": 3.6625, + "mean_token_accuracy": 0.3606494069099426, + "num_tokens": 307470459.0, + "step": 603 + }, + { + "epoch": 0.16333153055705787, + "grad_norm": 2.703125, + "learning_rate": 0.01999968188114297, + "loss": 3.4258, + "mean_token_accuracy": 0.369662880897522, + "num_tokens": 307994734.0, + "step": 604 + }, + { + "epoch": 0.1636019469983775, + "grad_norm": 3.3125, + "learning_rate": 0.019999668488200395, + "loss": 3.6156, + "mean_token_accuracy": 0.3364231586456299, + "num_tokens": 308518849.0, + "step": 605 + }, + { + "epoch": 0.16387236343969713, + "grad_norm": 2.828125, + "learning_rate": 0.019999654819121523, + "loss": 3.546, + "mean_token_accuracy": 0.3630979061126709, + "num_tokens": 309019489.0, + "step": 606 + }, + { + "epoch": 0.16414277988101678, + "grad_norm": 4.9375, + "learning_rate": 0.019999640873906776, + "loss": 3.579, + "mean_token_accuracy": 0.37580934166908264, + "num_tokens": 309543647.0, + "step": 607 + }, + { + "epoch": 0.1644131963223364, + "grad_norm": 4.0625, + "learning_rate": 0.019999626652556585, + "loss": 3.4133, + "mean_token_accuracy": 0.3621683418750763, + "num_tokens": 310067771.0, + "step": 608 + }, + { + "epoch": 0.16468361276365603, + "grad_norm": 3.328125, + "learning_rate": 0.019999612155071376, + "loss": 3.4797, + "mean_token_accuracy": 0.37958380579948425, + "num_tokens": 310591951.0, + "step": 609 + }, + { + "epoch": 0.16495402920497565, + "grad_norm": 3.8125, + "learning_rate": 0.019999597381451604, + "loss": 3.3796, + "mean_token_accuracy": 0.3570502996444702, + "num_tokens": 311116211.0, + "step": 610 + }, + { + "epoch": 0.1652244456462953, + "grad_norm": 91.0, + "learning_rate": 0.019999582331697713, + "loss": 13.0473, + "mean_token_accuracy": 0.005615473724901676, + "num_tokens": 311640453.0, + "step": 611 + }, + { + "epoch": 0.16549486208761494, + "grad_norm": 10.125, + "learning_rate": 0.019999567005810175, + "loss": 4.1599, + "mean_token_accuracy": 0.2595703601837158, + "num_tokens": 312164673.0, + "step": 612 + }, + { + "epoch": 0.16576527852893455, + "grad_norm": 2.625, + "learning_rate": 0.019999551403789454, + "loss": 3.6783, + "mean_token_accuracy": 0.3454132378101349, + "num_tokens": 312688946.0, + "step": 613 + }, + { + "epoch": 0.1660356949702542, + "grad_norm": 2.5, + "learning_rate": 0.01999953552563603, + "loss": 3.4321, + "mean_token_accuracy": 0.36285439133644104, + "num_tokens": 313184276.0, + "step": 614 + }, + { + "epoch": 0.16630611141157384, + "grad_norm": 3.734375, + "learning_rate": 0.019999519371350388, + "loss": 3.4667, + "mean_token_accuracy": 0.36339396238327026, + "num_tokens": 313708456.0, + "step": 615 + }, + { + "epoch": 0.16657652785289345, + "grad_norm": 2.734375, + "learning_rate": 0.01999950294093303, + "loss": 3.3005, + "mean_token_accuracy": 0.36761632561683655, + "num_tokens": 314179848.0, + "step": 616 + }, + { + "epoch": 0.1668469442942131, + "grad_norm": 4.25, + "learning_rate": 0.019999486234384453, + "loss": 3.2054, + "mean_token_accuracy": 0.3850552439689636, + "num_tokens": 314704056.0, + "step": 617 + }, + { + "epoch": 0.1671173607355327, + "grad_norm": 2.96875, + "learning_rate": 0.019999469251705173, + "loss": 3.4974, + "mean_token_accuracy": 0.362815797328949, + "num_tokens": 315228332.0, + "step": 618 + }, + { + "epoch": 0.16738777717685235, + "grad_norm": 3.703125, + "learning_rate": 0.019999451992895708, + "loss": 3.3217, + "mean_token_accuracy": 0.38493818044662476, + "num_tokens": 315752553.0, + "step": 619 + }, + { + "epoch": 0.167658193618172, + "grad_norm": 3.328125, + "learning_rate": 0.019999434457956596, + "loss": 3.4122, + "mean_token_accuracy": 0.38078001141548157, + "num_tokens": 316276830.0, + "step": 620 + }, + { + "epoch": 0.1679286100594916, + "grad_norm": 3.4375, + "learning_rate": 0.019999416646888365, + "loss": 3.7131, + "mean_token_accuracy": 0.3762126564979553, + "num_tokens": 316746264.0, + "step": 621 + }, + { + "epoch": 0.16819902650081126, + "grad_norm": 3.53125, + "learning_rate": 0.01999939855969157, + "loss": 3.4071, + "mean_token_accuracy": 0.3646818697452545, + "num_tokens": 317214252.0, + "step": 622 + }, + { + "epoch": 0.16846944294213087, + "grad_norm": 2.609375, + "learning_rate": 0.019999380196366753, + "loss": 3.6706, + "mean_token_accuracy": 0.35944557189941406, + "num_tokens": 317738525.0, + "step": 623 + }, + { + "epoch": 0.1687398593834505, + "grad_norm": 4.21875, + "learning_rate": 0.019999361556914494, + "loss": 3.4789, + "mean_token_accuracy": 0.33989250659942627, + "num_tokens": 318262724.0, + "step": 624 + }, + { + "epoch": 0.16901027582477016, + "grad_norm": 2.59375, + "learning_rate": 0.019999342641335353, + "loss": 3.4351, + "mean_token_accuracy": 0.37276989221572876, + "num_tokens": 318786966.0, + "step": 625 + }, + { + "epoch": 0.16928069226608977, + "grad_norm": 3.609375, + "learning_rate": 0.01999932344962992, + "loss": 3.4907, + "mean_token_accuracy": 0.3640401363372803, + "num_tokens": 319311180.0, + "step": 626 + }, + { + "epoch": 0.16955110870740941, + "grad_norm": 2.28125, + "learning_rate": 0.019999303981798774, + "loss": 3.1502, + "mean_token_accuracy": 0.4000386595726013, + "num_tokens": 319835437.0, + "step": 627 + }, + { + "epoch": 0.16982152514872903, + "grad_norm": 2.796875, + "learning_rate": 0.019999284237842516, + "loss": 3.5048, + "mean_token_accuracy": 0.35980939865112305, + "num_tokens": 320341777.0, + "step": 628 + }, + { + "epoch": 0.17009194159004867, + "grad_norm": 2.484375, + "learning_rate": 0.019999264217761756, + "loss": 3.4366, + "mean_token_accuracy": 0.3879762887954712, + "num_tokens": 320834168.0, + "step": 629 + }, + { + "epoch": 0.17036235803136832, + "grad_norm": 3.265625, + "learning_rate": 0.0199992439215571, + "loss": 3.4888, + "mean_token_accuracy": 0.3376929759979248, + "num_tokens": 321358211.0, + "step": 630 + }, + { + "epoch": 0.17063277447268793, + "grad_norm": 1.609375, + "learning_rate": 0.019999223349229177, + "loss": 11.0875, + "mean_token_accuracy": 6.454778031184105e-06, + "num_tokens": 321882409.0, + "step": 631 + }, + { + "epoch": 0.17090319091400757, + "grad_norm": 9.4375, + "learning_rate": 0.01999920250077862, + "loss": 4.1696, + "mean_token_accuracy": 0.2830676734447479, + "num_tokens": 322406684.0, + "step": 632 + }, + { + "epoch": 0.17117360735532722, + "grad_norm": 3.6875, + "learning_rate": 0.01999918137620606, + "loss": 3.7634, + "mean_token_accuracy": 0.3384413421154022, + "num_tokens": 322930801.0, + "step": 633 + }, + { + "epoch": 0.17144402379664683, + "grad_norm": 4.125, + "learning_rate": 0.019999159975512156, + "loss": 3.5072, + "mean_token_accuracy": 0.36651960015296936, + "num_tokens": 323405039.0, + "step": 634 + }, + { + "epoch": 0.17171444023796648, + "grad_norm": 2.859375, + "learning_rate": 0.019999138298697556, + "loss": 3.5951, + "mean_token_accuracy": 0.3636297881603241, + "num_tokens": 323929194.0, + "step": 635 + }, + { + "epoch": 0.1719848566792861, + "grad_norm": 2.609375, + "learning_rate": 0.019999116345762926, + "loss": 3.6496, + "mean_token_accuracy": 0.3517613410949707, + "num_tokens": 324453443.0, + "step": 636 + }, + { + "epoch": 0.17225527312060573, + "grad_norm": 2.59375, + "learning_rate": 0.019999094116708943, + "loss": 3.5388, + "mean_token_accuracy": 0.3364854156970978, + "num_tokens": 324977561.0, + "step": 637 + }, + { + "epoch": 0.17252568956192538, + "grad_norm": 2.53125, + "learning_rate": 0.01999907161153629, + "loss": 3.3829, + "mean_token_accuracy": 0.3732394874095917, + "num_tokens": 325496390.0, + "step": 638 + }, + { + "epoch": 0.172796106003245, + "grad_norm": 3.546875, + "learning_rate": 0.019999048830245655, + "loss": 3.5323, + "mean_token_accuracy": 0.3369391858577728, + "num_tokens": 326020659.0, + "step": 639 + }, + { + "epoch": 0.17306652244456464, + "grad_norm": 3.109375, + "learning_rate": 0.01999902577283774, + "loss": 3.3041, + "mean_token_accuracy": 0.36319097876548767, + "num_tokens": 326523978.0, + "step": 640 + }, + { + "epoch": 0.17333693888588425, + "grad_norm": 2.828125, + "learning_rate": 0.019999002439313244, + "loss": 3.4752, + "mean_token_accuracy": 0.36799123883247375, + "num_tokens": 327048104.0, + "step": 641 + }, + { + "epoch": 0.1736073553272039, + "grad_norm": 4.0625, + "learning_rate": 0.01999897882967289, + "loss": 3.2442, + "mean_token_accuracy": 0.3637692928314209, + "num_tokens": 327572239.0, + "step": 642 + }, + { + "epoch": 0.17387777176852354, + "grad_norm": 2.9375, + "learning_rate": 0.019998954943917405, + "loss": 3.4951, + "mean_token_accuracy": 0.35066884756088257, + "num_tokens": 328075910.0, + "step": 643 + }, + { + "epoch": 0.17414818820984315, + "grad_norm": 2.859375, + "learning_rate": 0.019998930782047515, + "loss": 3.4115, + "mean_token_accuracy": 0.360080361366272, + "num_tokens": 328596445.0, + "step": 644 + }, + { + "epoch": 0.1744186046511628, + "grad_norm": 2.734375, + "learning_rate": 0.01999890634406397, + "loss": 3.3893, + "mean_token_accuracy": 0.3680708706378937, + "num_tokens": 329120694.0, + "step": 645 + }, + { + "epoch": 0.1746890210924824, + "grad_norm": 3.28125, + "learning_rate": 0.01999888162996751, + "loss": 3.309, + "mean_token_accuracy": 0.38795459270477295, + "num_tokens": 329607788.0, + "step": 646 + }, + { + "epoch": 0.17495943753380205, + "grad_norm": 2.875, + "learning_rate": 0.019998856639758894, + "loss": 3.1941, + "mean_token_accuracy": 0.3903390169143677, + "num_tokens": 330132020.0, + "step": 647 + }, + { + "epoch": 0.1752298539751217, + "grad_norm": 4.03125, + "learning_rate": 0.019998831373438896, + "loss": 3.5954, + "mean_token_accuracy": 0.3456246256828308, + "num_tokens": 330656183.0, + "step": 648 + }, + { + "epoch": 0.1755002704164413, + "grad_norm": 4.21875, + "learning_rate": 0.019998805831008288, + "loss": 3.6621, + "mean_token_accuracy": 0.3678470849990845, + "num_tokens": 331180454.0, + "step": 649 + }, + { + "epoch": 0.17577068685776095, + "grad_norm": 3.5625, + "learning_rate": 0.01999878001246785, + "loss": 3.3963, + "mean_token_accuracy": 0.3453962206840515, + "num_tokens": 331704651.0, + "step": 650 + }, + { + "epoch": 0.1760411032990806, + "grad_norm": 7.0625, + "learning_rate": 0.019998753917818385, + "loss": 10.2294, + "mean_token_accuracy": 0.007710705511271954, + "num_tokens": 332228852.0, + "step": 651 + }, + { + "epoch": 0.1763115197404002, + "grad_norm": 11.1875, + "learning_rate": 0.01999872754706068, + "loss": 4.1175, + "mean_token_accuracy": 0.3112794756889343, + "num_tokens": 332708816.0, + "step": 652 + }, + { + "epoch": 0.17658193618171986, + "grad_norm": 7.46875, + "learning_rate": 0.01999870090019555, + "loss": 3.1814, + "mean_token_accuracy": 0.41277313232421875, + "num_tokens": 333176191.0, + "step": 653 + }, + { + "epoch": 0.17685235262303947, + "grad_norm": 2.0, + "learning_rate": 0.019998673977223817, + "loss": 3.7707, + "mean_token_accuracy": 0.3606916666030884, + "num_tokens": 333607561.0, + "step": 654 + }, + { + "epoch": 0.17712276906435911, + "grad_norm": 3.203125, + "learning_rate": 0.0199986467781463, + "loss": 3.5317, + "mean_token_accuracy": 0.3352765440940857, + "num_tokens": 334080121.0, + "step": 655 + }, + { + "epoch": 0.17739318550567876, + "grad_norm": 3.0625, + "learning_rate": 0.01999861930296384, + "loss": 3.5954, + "mean_token_accuracy": 0.3502906560897827, + "num_tokens": 334604363.0, + "step": 656 + }, + { + "epoch": 0.17766360194699837, + "grad_norm": 23.125, + "learning_rate": 0.019998591551677273, + "loss": 3.8738, + "mean_token_accuracy": 0.3137998580932617, + "num_tokens": 335128583.0, + "step": 657 + }, + { + "epoch": 0.17793401838831802, + "grad_norm": 3.859375, + "learning_rate": 0.019998563524287454, + "loss": 3.8606, + "mean_token_accuracy": 0.3210720419883728, + "num_tokens": 335652796.0, + "step": 658 + }, + { + "epoch": 0.17820443482963763, + "grad_norm": 2.578125, + "learning_rate": 0.019998535220795245, + "loss": 3.3812, + "mean_token_accuracy": 0.3502247929573059, + "num_tokens": 336177015.0, + "step": 659 + }, + { + "epoch": 0.17847485127095727, + "grad_norm": 2.859375, + "learning_rate": 0.01999850664120151, + "loss": 3.4592, + "mean_token_accuracy": 0.34144461154937744, + "num_tokens": 336701285.0, + "step": 660 + }, + { + "epoch": 0.17874526771227692, + "grad_norm": 2.59375, + "learning_rate": 0.01999847778550713, + "loss": 3.3659, + "mean_token_accuracy": 0.3650211989879608, + "num_tokens": 337225559.0, + "step": 661 + }, + { + "epoch": 0.17901568415359653, + "grad_norm": 3.59375, + "learning_rate": 0.01999844865371299, + "loss": 3.66, + "mean_token_accuracy": 0.3548136055469513, + "num_tokens": 337749771.0, + "step": 662 + }, + { + "epoch": 0.17928610059491618, + "grad_norm": 4.90625, + "learning_rate": 0.01999841924581998, + "loss": 3.4212, + "mean_token_accuracy": 0.37650787830352783, + "num_tokens": 338270800.0, + "step": 663 + }, + { + "epoch": 0.1795565170362358, + "grad_norm": 3.3125, + "learning_rate": 0.019998389561829005, + "loss": 3.4201, + "mean_token_accuracy": 0.3602573871612549, + "num_tokens": 338795066.0, + "step": 664 + }, + { + "epoch": 0.17982693347755543, + "grad_norm": 4.375, + "learning_rate": 0.01999835960174098, + "loss": 3.534, + "mean_token_accuracy": 0.3564205765724182, + "num_tokens": 339319243.0, + "step": 665 + }, + { + "epoch": 0.18009734991887508, + "grad_norm": 3.03125, + "learning_rate": 0.019998329365556816, + "loss": 3.3531, + "mean_token_accuracy": 0.3622100353240967, + "num_tokens": 339843529.0, + "step": 666 + }, + { + "epoch": 0.1803677663601947, + "grad_norm": 3.53125, + "learning_rate": 0.019998298853277443, + "loss": 3.4789, + "mean_token_accuracy": 0.34086233377456665, + "num_tokens": 340367713.0, + "step": 667 + }, + { + "epoch": 0.18063818280151434, + "grad_norm": 3.359375, + "learning_rate": 0.0199982680649038, + "loss": 3.5894, + "mean_token_accuracy": 0.3566935658454895, + "num_tokens": 340891952.0, + "step": 668 + }, + { + "epoch": 0.18090859924283395, + "grad_norm": 3.234375, + "learning_rate": 0.019998237000436833, + "loss": 3.3569, + "mean_token_accuracy": 0.3591189384460449, + "num_tokens": 341416181.0, + "step": 669 + }, + { + "epoch": 0.1811790156841536, + "grad_norm": 2.5625, + "learning_rate": 0.019998205659877494, + "loss": 3.3807, + "mean_token_accuracy": 0.37226226925849915, + "num_tokens": 341940344.0, + "step": 670 + }, + { + "epoch": 0.18144943212547324, + "grad_norm": 436.0, + "learning_rate": 0.019998174043226742, + "loss": 17.8781, + "mean_token_accuracy": 0.0005690156249329448, + "num_tokens": 342464496.0, + "step": 671 + }, + { + "epoch": 0.18171984856679285, + "grad_norm": 9.3125, + "learning_rate": 0.019998142150485544, + "loss": 4.2891, + "mean_token_accuracy": 0.2662872076034546, + "num_tokens": 342988644.0, + "step": 672 + }, + { + "epoch": 0.1819902650081125, + "grad_norm": 2.90625, + "learning_rate": 0.01999810998165489, + "loss": 3.754, + "mean_token_accuracy": 0.3322259187698364, + "num_tokens": 343512923.0, + "step": 673 + }, + { + "epoch": 0.18226068144943214, + "grad_norm": 3.109375, + "learning_rate": 0.019998077536735756, + "loss": 3.3947, + "mean_token_accuracy": 0.37369951605796814, + "num_tokens": 344037081.0, + "step": 674 + }, + { + "epoch": 0.18253109789075175, + "grad_norm": 3.421875, + "learning_rate": 0.019998044815729143, + "loss": 3.5171, + "mean_token_accuracy": 0.40078163146972656, + "num_tokens": 344425036.0, + "step": 675 + }, + { + "epoch": 0.1828015143320714, + "grad_norm": 2.59375, + "learning_rate": 0.01999801181863605, + "loss": 3.4888, + "mean_token_accuracy": 0.3603626787662506, + "num_tokens": 344949310.0, + "step": 676 + }, + { + "epoch": 0.183071930773391, + "grad_norm": 3.53125, + "learning_rate": 0.019997978545457495, + "loss": 3.6834, + "mean_token_accuracy": 0.3357067406177521, + "num_tokens": 345429154.0, + "step": 677 + }, + { + "epoch": 0.18334234721471065, + "grad_norm": 2.515625, + "learning_rate": 0.0199979449961945, + "loss": 3.5944, + "mean_token_accuracy": 0.35063639283180237, + "num_tokens": 345953410.0, + "step": 678 + }, + { + "epoch": 0.1836127636560303, + "grad_norm": 3.296875, + "learning_rate": 0.01999791117084809, + "loss": 3.6883, + "mean_token_accuracy": 0.36915406584739685, + "num_tokens": 346447932.0, + "step": 679 + }, + { + "epoch": 0.1838831800973499, + "grad_norm": 2.96875, + "learning_rate": 0.0199978770694193, + "loss": 3.2547, + "mean_token_accuracy": 0.3898846507072449, + "num_tokens": 346972070.0, + "step": 680 + }, + { + "epoch": 0.18415359653866956, + "grad_norm": 3.09375, + "learning_rate": 0.019997842691909185, + "loss": 3.4354, + "mean_token_accuracy": 0.36504650115966797, + "num_tokens": 347496273.0, + "step": 681 + }, + { + "epoch": 0.18442401297998917, + "grad_norm": 3.875, + "learning_rate": 0.01999780803831879, + "loss": 3.5507, + "mean_token_accuracy": 0.35334616899490356, + "num_tokens": 348020559.0, + "step": 682 + }, + { + "epoch": 0.18469442942130881, + "grad_norm": 3.265625, + "learning_rate": 0.01999777310864919, + "loss": 3.0553, + "mean_token_accuracy": 0.38048309087753296, + "num_tokens": 348544705.0, + "step": 683 + }, + { + "epoch": 0.18496484586262846, + "grad_norm": 2.65625, + "learning_rate": 0.019997737902901446, + "loss": 3.0921, + "mean_token_accuracy": 0.39119765162467957, + "num_tokens": 349068916.0, + "step": 684 + }, + { + "epoch": 0.18523526230394807, + "grad_norm": 3.171875, + "learning_rate": 0.01999770242107665, + "loss": 3.3998, + "mean_token_accuracy": 0.3828418254852295, + "num_tokens": 349593075.0, + "step": 685 + }, + { + "epoch": 0.18550567874526772, + "grad_norm": 3.359375, + "learning_rate": 0.019997666663175873, + "loss": 3.3755, + "mean_token_accuracy": 0.3653077483177185, + "num_tokens": 350093871.0, + "step": 686 + }, + { + "epoch": 0.18577609518658733, + "grad_norm": 2.609375, + "learning_rate": 0.01999763062920023, + "loss": 3.3615, + "mean_token_accuracy": 0.39171209931373596, + "num_tokens": 350564464.0, + "step": 687 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 3.359375, + "learning_rate": 0.019997594319150814, + "loss": 3.4053, + "mean_token_accuracy": 0.3822007477283478, + "num_tokens": 351025320.0, + "step": 688 + }, + { + "epoch": 0.18631692806922662, + "grad_norm": 2.34375, + "learning_rate": 0.019997557733028744, + "loss": 3.2641, + "mean_token_accuracy": 0.3602442741394043, + "num_tokens": 351549434.0, + "step": 689 + }, + { + "epoch": 0.18658734451054623, + "grad_norm": 3.046875, + "learning_rate": 0.019997520870835145, + "loss": 3.3893, + "mean_token_accuracy": 0.37803301215171814, + "num_tokens": 352073625.0, + "step": 690 + }, + { + "epoch": 0.18685776095186588, + "grad_norm": 87.5, + "learning_rate": 0.019997483732571146, + "loss": 12.1659, + "mean_token_accuracy": 0.012573350220918655, + "num_tokens": 352597735.0, + "step": 691 + }, + { + "epoch": 0.18712817739318552, + "grad_norm": 10.3125, + "learning_rate": 0.019997446318237883, + "loss": 4.3232, + "mean_token_accuracy": 0.265516459941864, + "num_tokens": 353121933.0, + "step": 692 + }, + { + "epoch": 0.18739859383450513, + "grad_norm": 2.09375, + "learning_rate": 0.01999740862783651, + "loss": 3.2146, + "mean_token_accuracy": 0.4051517844200134, + "num_tokens": 353611461.0, + "step": 693 + }, + { + "epoch": 0.18766901027582478, + "grad_norm": 2.65625, + "learning_rate": 0.01999737066136818, + "loss": 3.3307, + "mean_token_accuracy": 0.3728785514831543, + "num_tokens": 354119427.0, + "step": 694 + }, + { + "epoch": 0.1879394267171444, + "grad_norm": 4.5, + "learning_rate": 0.01999733241883406, + "loss": 3.5422, + "mean_token_accuracy": 0.33502131700515747, + "num_tokens": 354641531.0, + "step": 695 + }, + { + "epoch": 0.18820984315846404, + "grad_norm": 3.1875, + "learning_rate": 0.01999729390023532, + "loss": 3.4941, + "mean_token_accuracy": 0.3625836968421936, + "num_tokens": 355137096.0, + "step": 696 + }, + { + "epoch": 0.18848025959978368, + "grad_norm": 5.03125, + "learning_rate": 0.019997255105573146, + "loss": 3.6441, + "mean_token_accuracy": 0.3321285545825958, + "num_tokens": 355661367.0, + "step": 697 + }, + { + "epoch": 0.1887506760411033, + "grad_norm": 2.46875, + "learning_rate": 0.019997216034848723, + "loss": 3.3466, + "mean_token_accuracy": 0.39377856254577637, + "num_tokens": 356185642.0, + "step": 698 + }, + { + "epoch": 0.18902109248242294, + "grad_norm": 3.796875, + "learning_rate": 0.019997176688063254, + "loss": 3.3504, + "mean_token_accuracy": 0.3512175679206848, + "num_tokens": 356709802.0, + "step": 699 + }, + { + "epoch": 0.18929150892374255, + "grad_norm": 2.75, + "learning_rate": 0.019997137065217944, + "loss": 3.3092, + "mean_token_accuracy": 0.43340539932250977, + "num_tokens": 357145747.0, + "step": 700 + }, + { + "epoch": 0.1895619253650622, + "grad_norm": 2.859375, + "learning_rate": 0.019997097166314017, + "loss": 3.4787, + "mean_token_accuracy": 0.367072194814682, + "num_tokens": 357670025.0, + "step": 701 + }, + { + "epoch": 0.18983234180638184, + "grad_norm": 2.640625, + "learning_rate": 0.019997056991352683, + "loss": 3.4552, + "mean_token_accuracy": 0.35886234045028687, + "num_tokens": 358194288.0, + "step": 702 + }, + { + "epoch": 0.19010275824770145, + "grad_norm": 3.296875, + "learning_rate": 0.019997016540335182, + "loss": 3.3749, + "mean_token_accuracy": 0.3741893172264099, + "num_tokens": 358700668.0, + "step": 703 + }, + { + "epoch": 0.1903731746890211, + "grad_norm": 3.234375, + "learning_rate": 0.01999697581326276, + "loss": 3.269, + "mean_token_accuracy": 0.37154150009155273, + "num_tokens": 359163531.0, + "step": 704 + }, + { + "epoch": 0.1906435911303407, + "grad_norm": 3.265625, + "learning_rate": 0.019996934810136657, + "loss": 3.2345, + "mean_token_accuracy": 0.3757990002632141, + "num_tokens": 359687688.0, + "step": 705 + }, + { + "epoch": 0.19091400757166035, + "grad_norm": 3.296875, + "learning_rate": 0.019996893530958137, + "loss": 3.3909, + "mean_token_accuracy": 0.3836636245250702, + "num_tokens": 360159230.0, + "step": 706 + }, + { + "epoch": 0.19118442401298, + "grad_norm": 2.6875, + "learning_rate": 0.019996851975728466, + "loss": 3.3011, + "mean_token_accuracy": 0.38602834939956665, + "num_tokens": 360659859.0, + "step": 707 + }, + { + "epoch": 0.1914548404542996, + "grad_norm": 3.4375, + "learning_rate": 0.01999681014444892, + "loss": 3.6263, + "mean_token_accuracy": 0.3662223517894745, + "num_tokens": 361184099.0, + "step": 708 + }, + { + "epoch": 0.19172525689561926, + "grad_norm": 3.53125, + "learning_rate": 0.019996768037120777, + "loss": 3.6, + "mean_token_accuracy": 0.35181093215942383, + "num_tokens": 361708357.0, + "step": 709 + }, + { + "epoch": 0.1919956733369389, + "grad_norm": 2.875, + "learning_rate": 0.019996725653745336, + "loss": 3.4938, + "mean_token_accuracy": 0.37946125864982605, + "num_tokens": 362232628.0, + "step": 710 + }, + { + "epoch": 0.19226608977825851, + "grad_norm": 64.5, + "learning_rate": 0.019996682994323892, + "loss": 11.0509, + "mean_token_accuracy": 0.0036250066477805376, + "num_tokens": 362756743.0, + "step": 711 + }, + { + "epoch": 0.19253650621957816, + "grad_norm": 9.125, + "learning_rate": 0.01999664005885776, + "loss": 4.2884, + "mean_token_accuracy": 0.2532920241355896, + "num_tokens": 363269756.0, + "step": 712 + }, + { + "epoch": 0.19280692266089777, + "grad_norm": 2.921875, + "learning_rate": 0.01999659684734825, + "loss": 3.7704, + "mean_token_accuracy": 0.3371591567993164, + "num_tokens": 363793962.0, + "step": 713 + }, + { + "epoch": 0.19307733910221742, + "grad_norm": 3.4375, + "learning_rate": 0.01999655335979669, + "loss": 3.62, + "mean_token_accuracy": 0.34259700775146484, + "num_tokens": 364287594.0, + "step": 714 + }, + { + "epoch": 0.19334775554353706, + "grad_norm": 3.375, + "learning_rate": 0.01999650959620442, + "loss": 3.6161, + "mean_token_accuracy": 0.3433968722820282, + "num_tokens": 364811812.0, + "step": 715 + }, + { + "epoch": 0.19361817198485667, + "grad_norm": 3.4375, + "learning_rate": 0.019996465556572775, + "loss": 3.4508, + "mean_token_accuracy": 0.3525802493095398, + "num_tokens": 365336069.0, + "step": 716 + }, + { + "epoch": 0.19388858842617632, + "grad_norm": 2.59375, + "learning_rate": 0.01999642124090311, + "loss": 3.5064, + "mean_token_accuracy": 0.35501688718795776, + "num_tokens": 365842643.0, + "step": 717 + }, + { + "epoch": 0.19415900486749593, + "grad_norm": 3.171875, + "learning_rate": 0.019996376649196786, + "loss": 3.0987, + "mean_token_accuracy": 0.38269081711769104, + "num_tokens": 366340445.0, + "step": 718 + }, + { + "epoch": 0.19442942130881558, + "grad_norm": 3.375, + "learning_rate": 0.019996331781455174, + "loss": 3.5254, + "mean_token_accuracy": 0.34571373462677, + "num_tokens": 366864657.0, + "step": 719 + }, + { + "epoch": 0.19469983775013522, + "grad_norm": 3.703125, + "learning_rate": 0.01999628663767964, + "loss": 3.7485, + "mean_token_accuracy": 0.3581917881965637, + "num_tokens": 367317229.0, + "step": 720 + }, + { + "epoch": 0.19497025419145483, + "grad_norm": 2.390625, + "learning_rate": 0.01999624121787158, + "loss": 3.4686, + "mean_token_accuracy": 0.38515064120292664, + "num_tokens": 367729856.0, + "step": 721 + }, + { + "epoch": 0.19524067063277448, + "grad_norm": 2.828125, + "learning_rate": 0.01999619552203238, + "loss": 3.4084, + "mean_token_accuracy": 0.3640713095664978, + "num_tokens": 368254130.0, + "step": 722 + }, + { + "epoch": 0.1955110870740941, + "grad_norm": 2.40625, + "learning_rate": 0.019996149550163447, + "loss": 3.093, + "mean_token_accuracy": 0.3804783821105957, + "num_tokens": 368778318.0, + "step": 723 + }, + { + "epoch": 0.19578150351541374, + "grad_norm": 3.03125, + "learning_rate": 0.019996103302266185, + "loss": 3.4154, + "mean_token_accuracy": 0.3606516718864441, + "num_tokens": 369302442.0, + "step": 724 + }, + { + "epoch": 0.19605191995673338, + "grad_norm": 2.984375, + "learning_rate": 0.019996056778342028, + "loss": 3.4979, + "mean_token_accuracy": 0.39441949129104614, + "num_tokens": 369766708.0, + "step": 725 + }, + { + "epoch": 0.196322336398053, + "grad_norm": 3.015625, + "learning_rate": 0.019996009978392383, + "loss": 3.4326, + "mean_token_accuracy": 0.3579551875591278, + "num_tokens": 370290981.0, + "step": 726 + }, + { + "epoch": 0.19659275283937264, + "grad_norm": 3.828125, + "learning_rate": 0.0199959629024187, + "loss": 3.526, + "mean_token_accuracy": 0.3811362683773041, + "num_tokens": 370815256.0, + "step": 727 + }, + { + "epoch": 0.19686316928069228, + "grad_norm": 2.953125, + "learning_rate": 0.019995915550422422, + "loss": 3.3752, + "mean_token_accuracy": 0.36841627955436707, + "num_tokens": 371303021.0, + "step": 728 + }, + { + "epoch": 0.1971335857220119, + "grad_norm": 2.8125, + "learning_rate": 0.019995867922405, + "loss": 3.5652, + "mean_token_accuracy": 0.37278109788894653, + "num_tokens": 371827186.0, + "step": 729 + }, + { + "epoch": 0.19740400216333154, + "grad_norm": 2.515625, + "learning_rate": 0.01999582001836789, + "loss": 3.1787, + "mean_token_accuracy": 0.3898010849952698, + "num_tokens": 372302047.0, + "step": 730 + }, + { + "epoch": 0.19767441860465115, + "grad_norm": 63.5, + "learning_rate": 0.019995771838312575, + "loss": 11.2562, + "mean_token_accuracy": 0.0038501920644193888, + "num_tokens": 372826268.0, + "step": 731 + }, + { + "epoch": 0.1979448350459708, + "grad_norm": 5.21875, + "learning_rate": 0.019995723382240517, + "loss": 3.9391, + "mean_token_accuracy": 0.31866008043289185, + "num_tokens": 373350446.0, + "step": 732 + }, + { + "epoch": 0.19821525148729044, + "grad_norm": 6.125, + "learning_rate": 0.019995674650153214, + "loss": 3.4204, + "mean_token_accuracy": 0.38473984599113464, + "num_tokens": 373874711.0, + "step": 733 + }, + { + "epoch": 0.19848566792861005, + "grad_norm": 2.421875, + "learning_rate": 0.01999562564205216, + "loss": 3.3266, + "mean_token_accuracy": 0.3699533939361572, + "num_tokens": 374398942.0, + "step": 734 + }, + { + "epoch": 0.1987560843699297, + "grad_norm": 4.0, + "learning_rate": 0.019995576357938853, + "loss": 3.5936, + "mean_token_accuracy": 0.3529031574726105, + "num_tokens": 374863891.0, + "step": 735 + }, + { + "epoch": 0.1990265008112493, + "grad_norm": 3.578125, + "learning_rate": 0.019995526797814814, + "loss": 3.644, + "mean_token_accuracy": 0.35104626417160034, + "num_tokens": 375388089.0, + "step": 736 + }, + { + "epoch": 0.19929691725256896, + "grad_norm": 3.734375, + "learning_rate": 0.019995476961681555, + "loss": 3.5727, + "mean_token_accuracy": 0.3588973581790924, + "num_tokens": 375912370.0, + "step": 737 + }, + { + "epoch": 0.1995673336938886, + "grad_norm": 2.6875, + "learning_rate": 0.01999542684954061, + "loss": 3.2435, + "mean_token_accuracy": 0.38136762380599976, + "num_tokens": 376436580.0, + "step": 738 + }, + { + "epoch": 0.19983775013520821, + "grad_norm": 4.21875, + "learning_rate": 0.019995376461393516, + "loss": 3.3369, + "mean_token_accuracy": 0.3754657506942749, + "num_tokens": 376930306.0, + "step": 739 + }, + { + "epoch": 0.20010816657652786, + "grad_norm": 4.0, + "learning_rate": 0.01999532579724182, + "loss": 3.5773, + "mean_token_accuracy": 0.36906248331069946, + "num_tokens": 377395960.0, + "step": 740 + }, + { + "epoch": 0.20037858301784747, + "grad_norm": 2.90625, + "learning_rate": 0.01999527485708707, + "loss": 3.475, + "mean_token_accuracy": 0.3558570444583893, + "num_tokens": 377917188.0, + "step": 741 + }, + { + "epoch": 0.20064899945916712, + "grad_norm": 2.859375, + "learning_rate": 0.01999522364093084, + "loss": 3.1089, + "mean_token_accuracy": 0.3960534930229187, + "num_tokens": 378385374.0, + "step": 742 + }, + { + "epoch": 0.20091941590048676, + "grad_norm": 3.1875, + "learning_rate": 0.019995172148774693, + "loss": 3.4881, + "mean_token_accuracy": 0.35850095748901367, + "num_tokens": 378909647.0, + "step": 743 + }, + { + "epoch": 0.20118983234180637, + "grad_norm": 3.796875, + "learning_rate": 0.019995120380620207, + "loss": 3.2882, + "mean_token_accuracy": 0.3618384897708893, + "num_tokens": 379427259.0, + "step": 744 + }, + { + "epoch": 0.20146024878312602, + "grad_norm": 2.765625, + "learning_rate": 0.01999506833646898, + "loss": 3.1817, + "mean_token_accuracy": 0.3883522152900696, + "num_tokens": 379921272.0, + "step": 745 + }, + { + "epoch": 0.20173066522444566, + "grad_norm": 4.875, + "learning_rate": 0.019995016016322604, + "loss": 3.6277, + "mean_token_accuracy": 0.34182223677635193, + "num_tokens": 380445413.0, + "step": 746 + }, + { + "epoch": 0.20200108166576528, + "grad_norm": 4.90625, + "learning_rate": 0.01999496342018268, + "loss": 3.5879, + "mean_token_accuracy": 0.36212143301963806, + "num_tokens": 380969533.0, + "step": 747 + }, + { + "epoch": 0.20227149810708492, + "grad_norm": 2.703125, + "learning_rate": 0.019994910548050828, + "loss": 3.6286, + "mean_token_accuracy": 0.355055570602417, + "num_tokens": 381493803.0, + "step": 748 + }, + { + "epoch": 0.20254191454840453, + "grad_norm": 3.171875, + "learning_rate": 0.019994857399928668, + "loss": 3.1917, + "mean_token_accuracy": 0.37227746844291687, + "num_tokens": 382018023.0, + "step": 749 + }, + { + "epoch": 0.20281233098972418, + "grad_norm": 2.25, + "learning_rate": 0.01999480397581783, + "loss": 3.4345, + "mean_token_accuracy": 0.36375880241394043, + "num_tokens": 382520835.0, + "step": 750 + }, + { + "epoch": 0.20308274743104382, + "grad_norm": 55.25, + "learning_rate": 0.019994750275719956, + "loss": 11.2012, + "mean_token_accuracy": 0.010286267846822739, + "num_tokens": 383045013.0, + "step": 751 + }, + { + "epoch": 0.20335316387236343, + "grad_norm": 15.1875, + "learning_rate": 0.01999469629963669, + "loss": 4.4735, + "mean_token_accuracy": 0.2598966956138611, + "num_tokens": 383569213.0, + "step": 752 + }, + { + "epoch": 0.20362358031368308, + "grad_norm": 3.109375, + "learning_rate": 0.01999464204756969, + "loss": 3.5865, + "mean_token_accuracy": 0.33294543623924255, + "num_tokens": 384093436.0, + "step": 753 + }, + { + "epoch": 0.2038939967550027, + "grad_norm": 2.4375, + "learning_rate": 0.019994587519520626, + "loss": 3.5387, + "mean_token_accuracy": 0.3560386896133423, + "num_tokens": 384615516.0, + "step": 754 + }, + { + "epoch": 0.20416441319632234, + "grad_norm": 3.515625, + "learning_rate": 0.019994532715491155, + "loss": 3.4349, + "mean_token_accuracy": 0.3407652974128723, + "num_tokens": 385139754.0, + "step": 755 + }, + { + "epoch": 0.20443482963764198, + "grad_norm": 2.4375, + "learning_rate": 0.019994477635482975, + "loss": 3.5155, + "mean_token_accuracy": 0.3612153232097626, + "num_tokens": 385663948.0, + "step": 756 + }, + { + "epoch": 0.2047052460789616, + "grad_norm": 3.421875, + "learning_rate": 0.01999442227949777, + "loss": 3.5199, + "mean_token_accuracy": 0.35141342878341675, + "num_tokens": 386188215.0, + "step": 757 + }, + { + "epoch": 0.20497566252028124, + "grad_norm": 3.25, + "learning_rate": 0.019994366647537242, + "loss": 3.4741, + "mean_token_accuracy": 0.38173428177833557, + "num_tokens": 386690203.0, + "step": 758 + }, + { + "epoch": 0.20524607896160085, + "grad_norm": 3.390625, + "learning_rate": 0.01999431073960309, + "loss": 3.4758, + "mean_token_accuracy": 0.3468461334705353, + "num_tokens": 387214342.0, + "step": 759 + }, + { + "epoch": 0.2055164954029205, + "grad_norm": 2.734375, + "learning_rate": 0.01999425455569703, + "loss": 3.2767, + "mean_token_accuracy": 0.3651367127895355, + "num_tokens": 387738427.0, + "step": 760 + }, + { + "epoch": 0.20578691184424014, + "grad_norm": 2.46875, + "learning_rate": 0.019994198095820797, + "loss": 3.386, + "mean_token_accuracy": 0.31748145818710327, + "num_tokens": 388237627.0, + "step": 761 + }, + { + "epoch": 0.20605732828555975, + "grad_norm": 2.140625, + "learning_rate": 0.019994141359976115, + "loss": 3.2957, + "mean_token_accuracy": 0.3731059730052948, + "num_tokens": 388761802.0, + "step": 762 + }, + { + "epoch": 0.2063277447268794, + "grad_norm": 3.15625, + "learning_rate": 0.01999408434816472, + "loss": 3.5611, + "mean_token_accuracy": 0.36816516518592834, + "num_tokens": 389224103.0, + "step": 763 + }, + { + "epoch": 0.206598161168199, + "grad_norm": 3.3125, + "learning_rate": 0.019994027060388376, + "loss": 3.5187, + "mean_token_accuracy": 0.37039780616760254, + "num_tokens": 389707308.0, + "step": 764 + }, + { + "epoch": 0.20686857760951866, + "grad_norm": 3.203125, + "learning_rate": 0.019993969496648827, + "loss": 3.4644, + "mean_token_accuracy": 0.3512711524963379, + "num_tokens": 390231483.0, + "step": 765 + }, + { + "epoch": 0.2071389940508383, + "grad_norm": 2.578125, + "learning_rate": 0.019993911656947846, + "loss": 3.4067, + "mean_token_accuracy": 0.3697565495967865, + "num_tokens": 390755557.0, + "step": 766 + }, + { + "epoch": 0.20740941049215791, + "grad_norm": 2.78125, + "learning_rate": 0.01999385354128721, + "loss": 3.2783, + "mean_token_accuracy": 0.33757853507995605, + "num_tokens": 391279741.0, + "step": 767 + }, + { + "epoch": 0.20767982693347756, + "grad_norm": 3.25, + "learning_rate": 0.019993795149668692, + "loss": 3.3915, + "mean_token_accuracy": 0.37869155406951904, + "num_tokens": 391784530.0, + "step": 768 + }, + { + "epoch": 0.2079502433747972, + "grad_norm": 3.421875, + "learning_rate": 0.01999373648209409, + "loss": 3.5041, + "mean_token_accuracy": 0.376791775226593, + "num_tokens": 392253956.0, + "step": 769 + }, + { + "epoch": 0.20822065981611682, + "grad_norm": 3.171875, + "learning_rate": 0.019993677538565204, + "loss": 3.5316, + "mean_token_accuracy": 0.3561003506183624, + "num_tokens": 392778070.0, + "step": 770 + }, + { + "epoch": 0.20849107625743646, + "grad_norm": 3.34375, + "learning_rate": 0.019993618319083844, + "loss": 9.9255, + "mean_token_accuracy": 0.0008453746559098363, + "num_tokens": 393302256.0, + "step": 771 + }, + { + "epoch": 0.20876149269875607, + "grad_norm": 7.375, + "learning_rate": 0.01999355882365183, + "loss": 4.0547, + "mean_token_accuracy": 0.279170960187912, + "num_tokens": 393826491.0, + "step": 772 + }, + { + "epoch": 0.20903190914007572, + "grad_norm": 3.0625, + "learning_rate": 0.019993499052270974, + "loss": 3.835, + "mean_token_accuracy": 0.3234271705150604, + "num_tokens": 394350762.0, + "step": 773 + }, + { + "epoch": 0.20930232558139536, + "grad_norm": 3.140625, + "learning_rate": 0.019993439004943124, + "loss": 3.5139, + "mean_token_accuracy": 0.3381469249725342, + "num_tokens": 394874974.0, + "step": 774 + }, + { + "epoch": 0.20957274202271498, + "grad_norm": 3.125, + "learning_rate": 0.019993378681670115, + "loss": 3.8028, + "mean_token_accuracy": 0.35492318868637085, + "num_tokens": 395375316.0, + "step": 775 + }, + { + "epoch": 0.20984315846403462, + "grad_norm": 4.21875, + "learning_rate": 0.0199933180824538, + "loss": 3.5039, + "mean_token_accuracy": 0.35878002643585205, + "num_tokens": 395872271.0, + "step": 776 + }, + { + "epoch": 0.21011357490535423, + "grad_norm": 3.265625, + "learning_rate": 0.019993257207296045, + "loss": 3.6075, + "mean_token_accuracy": 0.362656831741333, + "num_tokens": 396378691.0, + "step": 777 + }, + { + "epoch": 0.21038399134667388, + "grad_norm": 3.796875, + "learning_rate": 0.019993196056198707, + "loss": 3.5661, + "mean_token_accuracy": 0.3460753560066223, + "num_tokens": 396902853.0, + "step": 778 + }, + { + "epoch": 0.21065440778799352, + "grad_norm": 3.03125, + "learning_rate": 0.01999313462916366, + "loss": 3.6057, + "mean_token_accuracy": 0.3684259057044983, + "num_tokens": 397427014.0, + "step": 779 + }, + { + "epoch": 0.21092482422931313, + "grad_norm": 4.0, + "learning_rate": 0.019993072926192807, + "loss": 3.3294, + "mean_token_accuracy": 0.3798232078552246, + "num_tokens": 397951141.0, + "step": 780 + }, + { + "epoch": 0.21119524067063278, + "grad_norm": 3.078125, + "learning_rate": 0.01999301094728802, + "loss": 3.4106, + "mean_token_accuracy": 0.3607911467552185, + "num_tokens": 398475416.0, + "step": 781 + }, + { + "epoch": 0.2114656571119524, + "grad_norm": 3.5625, + "learning_rate": 0.019992948692451218, + "loss": 3.5935, + "mean_token_accuracy": 0.3495473563671112, + "num_tokens": 398999594.0, + "step": 782 + }, + { + "epoch": 0.21173607355327204, + "grad_norm": 2.921875, + "learning_rate": 0.0199928861616843, + "loss": 3.3518, + "mean_token_accuracy": 0.34192246198654175, + "num_tokens": 399523717.0, + "step": 783 + }, + { + "epoch": 0.21200648999459168, + "grad_norm": 2.8125, + "learning_rate": 0.01999282335498919, + "loss": 3.4556, + "mean_token_accuracy": 0.36382991075515747, + "num_tokens": 400047828.0, + "step": 784 + }, + { + "epoch": 0.2122769064359113, + "grad_norm": 3.21875, + "learning_rate": 0.019992760272367814, + "loss": 3.4849, + "mean_token_accuracy": 0.37508857250213623, + "num_tokens": 400572046.0, + "step": 785 + }, + { + "epoch": 0.21254732287723094, + "grad_norm": 2.78125, + "learning_rate": 0.019992696913822102, + "loss": 3.4773, + "mean_token_accuracy": 0.3769344985485077, + "num_tokens": 401032377.0, + "step": 786 + }, + { + "epoch": 0.21281773931855058, + "grad_norm": 3.46875, + "learning_rate": 0.019992633279354008, + "loss": 3.4175, + "mean_token_accuracy": 0.3453299403190613, + "num_tokens": 401556587.0, + "step": 787 + }, + { + "epoch": 0.2130881557598702, + "grad_norm": 3.53125, + "learning_rate": 0.019992569368965478, + "loss": 3.5612, + "mean_token_accuracy": 0.34084537625312805, + "num_tokens": 402080790.0, + "step": 788 + }, + { + "epoch": 0.21335857220118984, + "grad_norm": 2.921875, + "learning_rate": 0.019992505182658472, + "loss": 3.4663, + "mean_token_accuracy": 0.38602468371391296, + "num_tokens": 402555246.0, + "step": 789 + }, + { + "epoch": 0.21362898864250945, + "grad_norm": 2.71875, + "learning_rate": 0.019992440720434962, + "loss": 3.2843, + "mean_token_accuracy": 0.36380869150161743, + "num_tokens": 403025935.0, + "step": 790 + }, + { + "epoch": 0.2138994050838291, + "grad_norm": 57.75, + "learning_rate": 0.019992375982296928, + "loss": 10.6743, + "mean_token_accuracy": 2.233391751360614e-05, + "num_tokens": 403536485.0, + "step": 791 + }, + { + "epoch": 0.21416982152514874, + "grad_norm": 5.4375, + "learning_rate": 0.019992310968246354, + "loss": 4.0993, + "mean_token_accuracy": 0.32765311002731323, + "num_tokens": 404060594.0, + "step": 792 + }, + { + "epoch": 0.21444023796646836, + "grad_norm": 7.78125, + "learning_rate": 0.019992245678285236, + "loss": 3.2939, + "mean_token_accuracy": 0.3972037434577942, + "num_tokens": 404566275.0, + "step": 793 + }, + { + "epoch": 0.214710654407788, + "grad_norm": 2.453125, + "learning_rate": 0.019992180112415575, + "loss": 3.3955, + "mean_token_accuracy": 0.3659091591835022, + "num_tokens": 405057265.0, + "step": 794 + }, + { + "epoch": 0.21498107084910761, + "grad_norm": 3.65625, + "learning_rate": 0.01999211427063938, + "loss": 3.809, + "mean_token_accuracy": 0.3562856912612915, + "num_tokens": 405521375.0, + "step": 795 + }, + { + "epoch": 0.21525148729042726, + "grad_norm": 2.828125, + "learning_rate": 0.01999204815295868, + "loss": 3.2384, + "mean_token_accuracy": 0.36393487453460693, + "num_tokens": 406045534.0, + "step": 796 + }, + { + "epoch": 0.2155219037317469, + "grad_norm": 2.5, + "learning_rate": 0.019991981759375495, + "loss": 3.7494, + "mean_token_accuracy": 0.34531283378601074, + "num_tokens": 406569780.0, + "step": 797 + }, + { + "epoch": 0.21579232017306652, + "grad_norm": 3.859375, + "learning_rate": 0.019991915089891867, + "loss": 3.6656, + "mean_token_accuracy": 0.3329179286956787, + "num_tokens": 407094060.0, + "step": 798 + }, + { + "epoch": 0.21606273661438616, + "grad_norm": 3.359375, + "learning_rate": 0.01999184814450984, + "loss": 3.5166, + "mean_token_accuracy": 0.36310768127441406, + "num_tokens": 407618211.0, + "step": 799 + }, + { + "epoch": 0.21633315305570577, + "grad_norm": 3.65625, + "learning_rate": 0.01999178092323147, + "loss": 3.5716, + "mean_token_accuracy": 0.3613297939300537, + "num_tokens": 408103002.0, + "step": 800 + }, + { + "epoch": 0.21660356949702542, + "grad_norm": 3.375, + "learning_rate": 0.019991713426058818, + "loss": 3.4517, + "mean_token_accuracy": 0.3523766100406647, + "num_tokens": 408595773.0, + "step": 801 + }, + { + "epoch": 0.21687398593834506, + "grad_norm": 3.03125, + "learning_rate": 0.01999164565299396, + "loss": 3.1785, + "mean_token_accuracy": 0.3829103708267212, + "num_tokens": 409092754.0, + "step": 802 + }, + { + "epoch": 0.21714440237966467, + "grad_norm": 2.28125, + "learning_rate": 0.019991577604038965, + "loss": 3.4894, + "mean_token_accuracy": 0.3597593605518341, + "num_tokens": 409616917.0, + "step": 803 + }, + { + "epoch": 0.21741481882098432, + "grad_norm": 3.0625, + "learning_rate": 0.019991509279195924, + "loss": 3.4567, + "mean_token_accuracy": 0.35885995626449585, + "num_tokens": 410094437.0, + "step": 804 + }, + { + "epoch": 0.21768523526230396, + "grad_norm": 2.515625, + "learning_rate": 0.01999144067846694, + "loss": 3.3846, + "mean_token_accuracy": 0.3703562617301941, + "num_tokens": 410618576.0, + "step": 805 + }, + { + "epoch": 0.21795565170362358, + "grad_norm": 3.671875, + "learning_rate": 0.019991371801854115, + "loss": 3.5879, + "mean_token_accuracy": 0.37442755699157715, + "num_tokens": 411142682.0, + "step": 806 + }, + { + "epoch": 0.21822606814494322, + "grad_norm": 2.984375, + "learning_rate": 0.019991302649359558, + "loss": 3.4532, + "mean_token_accuracy": 0.36195096373558044, + "num_tokens": 411666860.0, + "step": 807 + }, + { + "epoch": 0.21849648458626283, + "grad_norm": 2.53125, + "learning_rate": 0.019991233220985397, + "loss": 3.524, + "mean_token_accuracy": 0.367381751537323, + "num_tokens": 412191073.0, + "step": 808 + }, + { + "epoch": 0.21876690102758248, + "grad_norm": 2.796875, + "learning_rate": 0.01999116351673376, + "loss": 3.4869, + "mean_token_accuracy": 0.35456833243370056, + "num_tokens": 412715221.0, + "step": 809 + }, + { + "epoch": 0.21903731746890212, + "grad_norm": 3.90625, + "learning_rate": 0.01999109353660678, + "loss": 3.3658, + "mean_token_accuracy": 0.374271959066391, + "num_tokens": 413214071.0, + "step": 810 + }, + { + "epoch": 0.21930773391022174, + "grad_norm": 3.0625, + "learning_rate": 0.019991023280606614, + "loss": 11.3353, + "mean_token_accuracy": 0.0, + "num_tokens": 413738191.0, + "step": 811 + }, + { + "epoch": 0.21957815035154138, + "grad_norm": 11.75, + "learning_rate": 0.01999095274873541, + "loss": 4.3611, + "mean_token_accuracy": 0.29636508226394653, + "num_tokens": 414262372.0, + "step": 812 + }, + { + "epoch": 0.219848566792861, + "grad_norm": 2.25, + "learning_rate": 0.01999088194099534, + "loss": 3.5599, + "mean_token_accuracy": 0.3485286235809326, + "num_tokens": 414770297.0, + "step": 813 + }, + { + "epoch": 0.22011898323418064, + "grad_norm": 3.0625, + "learning_rate": 0.019990810857388564, + "loss": 3.4144, + "mean_token_accuracy": 0.380728542804718, + "num_tokens": 415268663.0, + "step": 814 + }, + { + "epoch": 0.22038939967550028, + "grad_norm": 4.28125, + "learning_rate": 0.01999073949791727, + "loss": 3.612, + "mean_token_accuracy": 0.33333078026771545, + "num_tokens": 415748069.0, + "step": 815 + }, + { + "epoch": 0.2206598161168199, + "grad_norm": 2.0625, + "learning_rate": 0.019990667862583657, + "loss": 3.2233, + "mean_token_accuracy": 0.3953731060028076, + "num_tokens": 416269186.0, + "step": 816 + }, + { + "epoch": 0.22093023255813954, + "grad_norm": 3.109375, + "learning_rate": 0.019990595951389908, + "loss": 3.5574, + "mean_token_accuracy": 0.3743823766708374, + "num_tokens": 416685802.0, + "step": 817 + }, + { + "epoch": 0.22120064899945915, + "grad_norm": 3.765625, + "learning_rate": 0.019990523764338238, + "loss": 3.3674, + "mean_token_accuracy": 0.34320324659347534, + "num_tokens": 417209896.0, + "step": 818 + }, + { + "epoch": 0.2214710654407788, + "grad_norm": 3.515625, + "learning_rate": 0.01999045130143086, + "loss": 3.591, + "mean_token_accuracy": 0.34206724166870117, + "num_tokens": 417734077.0, + "step": 819 + }, + { + "epoch": 0.22174148188209844, + "grad_norm": 37.75, + "learning_rate": 0.019990378562669994, + "loss": 3.5338, + "mean_token_accuracy": 0.3332154154777527, + "num_tokens": 418258307.0, + "step": 820 + }, + { + "epoch": 0.22201189832341806, + "grad_norm": 5.3125, + "learning_rate": 0.019990305548057877, + "loss": 3.8157, + "mean_token_accuracy": 0.3213635981082916, + "num_tokens": 418776523.0, + "step": 821 + }, + { + "epoch": 0.2222823147647377, + "grad_norm": 3.171875, + "learning_rate": 0.019990232257596746, + "loss": 3.5777, + "mean_token_accuracy": 0.3206520676612854, + "num_tokens": 419300797.0, + "step": 822 + }, + { + "epoch": 0.22255273120605734, + "grad_norm": 2.921875, + "learning_rate": 0.01999015869128885, + "loss": 3.5756, + "mean_token_accuracy": 0.3597418963909149, + "num_tokens": 419824994.0, + "step": 823 + }, + { + "epoch": 0.22282314764737696, + "grad_norm": 3.640625, + "learning_rate": 0.019990084849136447, + "loss": 3.5408, + "mean_token_accuracy": 0.3504372239112854, + "num_tokens": 420349276.0, + "step": 824 + }, + { + "epoch": 0.2230935640886966, + "grad_norm": 2.625, + "learning_rate": 0.019990010731141806, + "loss": 3.4358, + "mean_token_accuracy": 0.35083627700805664, + "num_tokens": 420873557.0, + "step": 825 + }, + { + "epoch": 0.22336398053001622, + "grad_norm": 3.578125, + "learning_rate": 0.019989936337307196, + "loss": 3.5487, + "mean_token_accuracy": 0.35474693775177, + "num_tokens": 421397620.0, + "step": 826 + }, + { + "epoch": 0.22363439697133586, + "grad_norm": 2.671875, + "learning_rate": 0.0199898616676349, + "loss": 3.3748, + "mean_token_accuracy": 0.3825027346611023, + "num_tokens": 421921830.0, + "step": 827 + }, + { + "epoch": 0.2239048134126555, + "grad_norm": 2.921875, + "learning_rate": 0.01998978672212721, + "loss": 3.2852, + "mean_token_accuracy": 0.3662080764770508, + "num_tokens": 422446018.0, + "step": 828 + }, + { + "epoch": 0.22417522985397512, + "grad_norm": 2.5, + "learning_rate": 0.01998971150078643, + "loss": 3.4814, + "mean_token_accuracy": 0.3659335970878601, + "num_tokens": 422970184.0, + "step": 829 + }, + { + "epoch": 0.22444564629529476, + "grad_norm": 2.84375, + "learning_rate": 0.01998963600361486, + "loss": 3.2954, + "mean_token_accuracy": 0.3703540861606598, + "num_tokens": 423494371.0, + "step": 830 + }, + { + "epoch": 0.22471606273661437, + "grad_norm": 7.21875, + "learning_rate": 0.019989560230614822, + "loss": 9.7235, + "mean_token_accuracy": 0.006573945749551058, + "num_tokens": 424018579.0, + "step": 831 + }, + { + "epoch": 0.22498647917793402, + "grad_norm": 6.625, + "learning_rate": 0.01998948418178864, + "loss": 3.8831, + "mean_token_accuracy": 0.33127841353416443, + "num_tokens": 424542734.0, + "step": 832 + }, + { + "epoch": 0.22525689561925366, + "grad_norm": 2.390625, + "learning_rate": 0.01998940785713865, + "loss": 3.3965, + "mean_token_accuracy": 0.3739239573478699, + "num_tokens": 425022258.0, + "step": 833 + }, + { + "epoch": 0.22552731206057328, + "grad_norm": 3.015625, + "learning_rate": 0.019989331256667187, + "loss": 3.5901, + "mean_token_accuracy": 0.35007986426353455, + "num_tokens": 425535792.0, + "step": 834 + }, + { + "epoch": 0.22579772850189292, + "grad_norm": 3.921875, + "learning_rate": 0.019989254380376607, + "loss": 3.6895, + "mean_token_accuracy": 0.34323590993881226, + "num_tokens": 426060073.0, + "step": 835 + }, + { + "epoch": 0.22606814494321253, + "grad_norm": 3.109375, + "learning_rate": 0.019989177228269265, + "loss": 3.3814, + "mean_token_accuracy": 0.3642979562282562, + "num_tokens": 426552554.0, + "step": 836 + }, + { + "epoch": 0.22633856138453218, + "grad_norm": 3.71875, + "learning_rate": 0.01998909980034753, + "loss": 3.3952, + "mean_token_accuracy": 0.38041192293167114, + "num_tokens": 427076740.0, + "step": 837 + }, + { + "epoch": 0.22660897782585182, + "grad_norm": 3.1875, + "learning_rate": 0.01998902209661378, + "loss": 3.6732, + "mean_token_accuracy": 0.33332201838493347, + "num_tokens": 427600927.0, + "step": 838 + }, + { + "epoch": 0.22687939426717144, + "grad_norm": 3.25, + "learning_rate": 0.019988944117070397, + "loss": 3.1668, + "mean_token_accuracy": 0.39327794313430786, + "num_tokens": 428125113.0, + "step": 839 + }, + { + "epoch": 0.22714981070849108, + "grad_norm": 2.71875, + "learning_rate": 0.019988865861719773, + "loss": 3.579, + "mean_token_accuracy": 0.3430555760860443, + "num_tokens": 428649243.0, + "step": 840 + }, + { + "epoch": 0.22742022714981072, + "grad_norm": 3.03125, + "learning_rate": 0.01998878733056431, + "loss": 3.4313, + "mean_token_accuracy": 0.35789304971694946, + "num_tokens": 429161539.0, + "step": 841 + }, + { + "epoch": 0.22769064359113034, + "grad_norm": 4.125, + "learning_rate": 0.019988708523606416, + "loss": 3.5922, + "mean_token_accuracy": 0.373357355594635, + "num_tokens": 429685772.0, + "step": 842 + }, + { + "epoch": 0.22796106003244998, + "grad_norm": 4.1875, + "learning_rate": 0.019988629440848515, + "loss": 3.6357, + "mean_token_accuracy": 0.3585886061191559, + "num_tokens": 430153115.0, + "step": 843 + }, + { + "epoch": 0.2282314764737696, + "grad_norm": 2.21875, + "learning_rate": 0.019988550082293024, + "loss": 3.3541, + "mean_token_accuracy": 0.3703112304210663, + "num_tokens": 430677328.0, + "step": 844 + }, + { + "epoch": 0.22850189291508924, + "grad_norm": 3.0, + "learning_rate": 0.019988470447942382, + "loss": 3.4578, + "mean_token_accuracy": 0.3573242425918579, + "num_tokens": 431201461.0, + "step": 845 + }, + { + "epoch": 0.22877230935640888, + "grad_norm": 3.390625, + "learning_rate": 0.019988390537799036, + "loss": 3.2888, + "mean_token_accuracy": 0.3812709450721741, + "num_tokens": 431725736.0, + "step": 846 + }, + { + "epoch": 0.2290427257977285, + "grad_norm": 2.671875, + "learning_rate": 0.01998831035186544, + "loss": 3.228, + "mean_token_accuracy": 0.40295684337615967, + "num_tokens": 432216704.0, + "step": 847 + }, + { + "epoch": 0.22931314223904814, + "grad_norm": 3.140625, + "learning_rate": 0.019988229890144038, + "loss": 3.6402, + "mean_token_accuracy": 0.3560306429862976, + "num_tokens": 432736816.0, + "step": 848 + }, + { + "epoch": 0.22958355868036776, + "grad_norm": 3.203125, + "learning_rate": 0.01998814915263732, + "loss": 3.5681, + "mean_token_accuracy": 0.36172670125961304, + "num_tokens": 433261096.0, + "step": 849 + }, + { + "epoch": 0.2298539751216874, + "grad_norm": 4.5, + "learning_rate": 0.01998806813934775, + "loss": 3.4348, + "mean_token_accuracy": 0.3515021800994873, + "num_tokens": 433785322.0, + "step": 850 + }, + { + "epoch": 0.23012439156300704, + "grad_norm": 59.5, + "learning_rate": 0.019987986850277816, + "loss": 18.1873, + "mean_token_accuracy": 0.03697405755519867, + "num_tokens": 434268573.0, + "step": 851 + }, + { + "epoch": 0.23039480800432666, + "grad_norm": 8.375, + "learning_rate": 0.019987905285430013, + "loss": 3.9408, + "mean_token_accuracy": 0.3040441870689392, + "num_tokens": 434792790.0, + "step": 852 + }, + { + "epoch": 0.2306652244456463, + "grad_norm": 3.1875, + "learning_rate": 0.019987823444806846, + "loss": 3.4826, + "mean_token_accuracy": 0.3639627695083618, + "num_tokens": 435306111.0, + "step": 853 + }, + { + "epoch": 0.23093564088696591, + "grad_norm": 4.21875, + "learning_rate": 0.019987741328410826, + "loss": 3.6641, + "mean_token_accuracy": 0.3675260543823242, + "num_tokens": 435778422.0, + "step": 854 + }, + { + "epoch": 0.23120605732828556, + "grad_norm": 3.109375, + "learning_rate": 0.01998765893624447, + "loss": 3.3523, + "mean_token_accuracy": 0.3819577693939209, + "num_tokens": 436302676.0, + "step": 855 + }, + { + "epoch": 0.2314764737696052, + "grad_norm": 3.921875, + "learning_rate": 0.019987576268310304, + "loss": 3.6243, + "mean_token_accuracy": 0.36807262897491455, + "num_tokens": 436826687.0, + "step": 856 + }, + { + "epoch": 0.23174689021092482, + "grad_norm": 3.984375, + "learning_rate": 0.019987493324610868, + "loss": 3.4831, + "mean_token_accuracy": 0.3544316291809082, + "num_tokens": 437340397.0, + "step": 857 + }, + { + "epoch": 0.23201730665224446, + "grad_norm": 3.109375, + "learning_rate": 0.01998741010514871, + "loss": 3.6343, + "mean_token_accuracy": 0.37296998500823975, + "num_tokens": 437864630.0, + "step": 858 + }, + { + "epoch": 0.2322877230935641, + "grad_norm": 3.75, + "learning_rate": 0.019987326609926376, + "loss": 3.7117, + "mean_token_accuracy": 0.32913142442703247, + "num_tokens": 438388913.0, + "step": 859 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 51.75, + "learning_rate": 0.019987242838946435, + "loss": 4.7828, + "mean_token_accuracy": 0.32924574613571167, + "num_tokens": 438913193.0, + "step": 860 + }, + { + "epoch": 0.23282855597620336, + "grad_norm": 3.78125, + "learning_rate": 0.01998715879221145, + "loss": 3.833, + "mean_token_accuracy": 0.38072967529296875, + "num_tokens": 439371856.0, + "step": 861 + }, + { + "epoch": 0.23309897241752298, + "grad_norm": 2.296875, + "learning_rate": 0.019987074469724007, + "loss": 3.5057, + "mean_token_accuracy": 0.3675452470779419, + "num_tokens": 439896006.0, + "step": 862 + }, + { + "epoch": 0.23336938885884262, + "grad_norm": 3.15625, + "learning_rate": 0.01998698987148669, + "loss": 3.4674, + "mean_token_accuracy": 0.3777623176574707, + "num_tokens": 440396580.0, + "step": 863 + }, + { + "epoch": 0.23363980530016226, + "grad_norm": 2.203125, + "learning_rate": 0.01998690499750209, + "loss": 3.2755, + "mean_token_accuracy": 0.3772302567958832, + "num_tokens": 440920731.0, + "step": 864 + }, + { + "epoch": 0.23391022174148188, + "grad_norm": 2.96875, + "learning_rate": 0.01998681984777282, + "loss": 3.6505, + "mean_token_accuracy": 0.3557910621166229, + "num_tokens": 441444856.0, + "step": 865 + }, + { + "epoch": 0.23418063818280152, + "grad_norm": 3.734375, + "learning_rate": 0.01998673442230149, + "loss": 3.615, + "mean_token_accuracy": 0.35815173387527466, + "num_tokens": 441969080.0, + "step": 866 + }, + { + "epoch": 0.23445105462412114, + "grad_norm": 3.578125, + "learning_rate": 0.019986648721090718, + "loss": 3.3071, + "mean_token_accuracy": 0.3657049834728241, + "num_tokens": 442493290.0, + "step": 867 + }, + { + "epoch": 0.23472147106544078, + "grad_norm": 3.515625, + "learning_rate": 0.019986562744143132, + "loss": 3.3481, + "mean_token_accuracy": 0.3693435490131378, + "num_tokens": 443017355.0, + "step": 868 + }, + { + "epoch": 0.23499188750676042, + "grad_norm": 2.359375, + "learning_rate": 0.019986476491461375, + "loss": 3.42, + "mean_token_accuracy": 0.3772428333759308, + "num_tokens": 443541458.0, + "step": 869 + }, + { + "epoch": 0.23526230394808004, + "grad_norm": 3.765625, + "learning_rate": 0.019986389963048096, + "loss": 3.3175, + "mean_token_accuracy": 0.37876224517822266, + "num_tokens": 444065633.0, + "step": 870 + }, + { + "epoch": 0.23553272038939968, + "grad_norm": 115.0, + "learning_rate": 0.01998630315890594, + "loss": 16.0516, + "mean_token_accuracy": 0.0002084575389744714, + "num_tokens": 444568859.0, + "step": 871 + }, + { + "epoch": 0.2358031368307193, + "grad_norm": 7.8125, + "learning_rate": 0.01998621607903758, + "loss": 4.0508, + "mean_token_accuracy": 0.23986075818538666, + "num_tokens": 445093126.0, + "step": 872 + }, + { + "epoch": 0.23607355327203894, + "grad_norm": 2.5, + "learning_rate": 0.01998612872344568, + "loss": 3.807, + "mean_token_accuracy": 0.32564547657966614, + "num_tokens": 445617408.0, + "step": 873 + }, + { + "epoch": 0.23634396971335858, + "grad_norm": 2.953125, + "learning_rate": 0.01998604109213293, + "loss": 3.5383, + "mean_token_accuracy": 0.3677964210510254, + "num_tokens": 446141579.0, + "step": 874 + }, + { + "epoch": 0.2366143861546782, + "grad_norm": 4.3125, + "learning_rate": 0.01998595318510201, + "loss": 3.4939, + "mean_token_accuracy": 0.35995516180992126, + "num_tokens": 446665789.0, + "step": 875 + }, + { + "epoch": 0.23688480259599784, + "grad_norm": 3.421875, + "learning_rate": 0.019985865002355618, + "loss": 3.337, + "mean_token_accuracy": 0.3793039917945862, + "num_tokens": 447190010.0, + "step": 876 + }, + { + "epoch": 0.23715521903731746, + "grad_norm": 2.453125, + "learning_rate": 0.019985776543896466, + "loss": 3.656, + "mean_token_accuracy": 0.35535728931427, + "num_tokens": 447714269.0, + "step": 877 + }, + { + "epoch": 0.2374256354786371, + "grad_norm": 3.953125, + "learning_rate": 0.019985687809727263, + "loss": 3.7393, + "mean_token_accuracy": 0.2933887541294098, + "num_tokens": 448238529.0, + "step": 878 + }, + { + "epoch": 0.23769605191995674, + "grad_norm": 32.0, + "learning_rate": 0.01998559879985073, + "loss": 4.3963, + "mean_token_accuracy": 0.30478835105895996, + "num_tokens": 448719753.0, + "step": 879 + }, + { + "epoch": 0.23796646836127636, + "grad_norm": 3.609375, + "learning_rate": 0.019985509514269608, + "loss": 3.4911, + "mean_token_accuracy": 0.3352075219154358, + "num_tokens": 449243934.0, + "step": 880 + }, + { + "epoch": 0.238236884802596, + "grad_norm": 3.640625, + "learning_rate": 0.019985419952986617, + "loss": 3.4673, + "mean_token_accuracy": 0.34001702070236206, + "num_tokens": 449768191.0, + "step": 881 + }, + { + "epoch": 0.23850730124391564, + "grad_norm": 4.4375, + "learning_rate": 0.019985330116004528, + "loss": 3.2889, + "mean_token_accuracy": 0.38830217719078064, + "num_tokens": 450292445.0, + "step": 882 + }, + { + "epoch": 0.23877771768523526, + "grad_norm": 2.875, + "learning_rate": 0.019985240003326084, + "loss": 3.4565, + "mean_token_accuracy": 0.37809062004089355, + "num_tokens": 450796681.0, + "step": 883 + }, + { + "epoch": 0.2390481341265549, + "grad_norm": 3.15625, + "learning_rate": 0.019985149614954054, + "loss": 3.4295, + "mean_token_accuracy": 0.34870803356170654, + "num_tokens": 451320861.0, + "step": 884 + }, + { + "epoch": 0.23931855056787452, + "grad_norm": 3.1875, + "learning_rate": 0.019985058950891206, + "loss": 3.3938, + "mean_token_accuracy": 0.359893798828125, + "num_tokens": 451845121.0, + "step": 885 + }, + { + "epoch": 0.23958896700919416, + "grad_norm": 2.828125, + "learning_rate": 0.01998496801114033, + "loss": 3.4943, + "mean_token_accuracy": 0.35637232661247253, + "num_tokens": 452369297.0, + "step": 886 + }, + { + "epoch": 0.2398593834505138, + "grad_norm": 2.3125, + "learning_rate": 0.019984876795704213, + "loss": 3.404, + "mean_token_accuracy": 0.352289080619812, + "num_tokens": 452893559.0, + "step": 887 + }, + { + "epoch": 0.24012979989183342, + "grad_norm": 2.6875, + "learning_rate": 0.01998478530458565, + "loss": 3.4326, + "mean_token_accuracy": 0.3574836552143097, + "num_tokens": 453417602.0, + "step": 888 + }, + { + "epoch": 0.24040021633315306, + "grad_norm": 2.65625, + "learning_rate": 0.01998469353778745, + "loss": 3.336, + "mean_token_accuracy": 0.36581701040267944, + "num_tokens": 453941802.0, + "step": 889 + }, + { + "epoch": 0.24067063277447268, + "grad_norm": 3.0625, + "learning_rate": 0.019984601495312438, + "loss": 3.4992, + "mean_token_accuracy": 0.35524892807006836, + "num_tokens": 454466071.0, + "step": 890 + }, + { + "epoch": 0.24094104921579232, + "grad_norm": 75.5, + "learning_rate": 0.019984509177163424, + "loss": 11.9954, + "mean_token_accuracy": 0.024060983210802078, + "num_tokens": 454953343.0, + "step": 891 + }, + { + "epoch": 0.24121146565711196, + "grad_norm": 7.21875, + "learning_rate": 0.019984416583343247, + "loss": 4.1537, + "mean_token_accuracy": 0.3333004415035248, + "num_tokens": 455477593.0, + "step": 892 + }, + { + "epoch": 0.24148188209843158, + "grad_norm": 3.25, + "learning_rate": 0.019984323713854748, + "loss": 3.6651, + "mean_token_accuracy": 0.3373591899871826, + "num_tokens": 456001755.0, + "step": 893 + }, + { + "epoch": 0.24175229853975122, + "grad_norm": 4.15625, + "learning_rate": 0.019984230568700776, + "loss": 3.5224, + "mean_token_accuracy": 0.3474406599998474, + "num_tokens": 456526028.0, + "step": 894 + }, + { + "epoch": 0.24202271498107084, + "grad_norm": 3.234375, + "learning_rate": 0.01998413714788419, + "loss": 3.4262, + "mean_token_accuracy": 0.3661491870880127, + "num_tokens": 457050235.0, + "step": 895 + }, + { + "epoch": 0.24229313142239048, + "grad_norm": 3.34375, + "learning_rate": 0.019984043451407855, + "loss": 3.7457, + "mean_token_accuracy": 0.3410283923149109, + "num_tokens": 457574441.0, + "step": 896 + }, + { + "epoch": 0.24256354786371012, + "grad_norm": 2.75, + "learning_rate": 0.01998394947927465, + "loss": 3.2978, + "mean_token_accuracy": 0.37130826711654663, + "num_tokens": 458098564.0, + "step": 897 + }, + { + "epoch": 0.24283396430502974, + "grad_norm": 3.328125, + "learning_rate": 0.01998385523148745, + "loss": 3.3788, + "mean_token_accuracy": 0.3498883843421936, + "num_tokens": 458622668.0, + "step": 898 + }, + { + "epoch": 0.24310438074634938, + "grad_norm": 2.1875, + "learning_rate": 0.019983760708049154, + "loss": 3.2776, + "mean_token_accuracy": 0.3740672469139099, + "num_tokens": 459146807.0, + "step": 899 + }, + { + "epoch": 0.24337479718766902, + "grad_norm": 2.859375, + "learning_rate": 0.01998366590896266, + "loss": 3.3585, + "mean_token_accuracy": 0.35805612802505493, + "num_tokens": 459671085.0, + "step": 900 + }, + { + "epoch": 0.24364521362898864, + "grad_norm": 2.734375, + "learning_rate": 0.019983570834230874, + "loss": 3.4023, + "mean_token_accuracy": 0.3710271716117859, + "num_tokens": 460195306.0, + "step": 901 + }, + { + "epoch": 0.24391563007030828, + "grad_norm": 3.578125, + "learning_rate": 0.019983475483856723, + "loss": 3.5678, + "mean_token_accuracy": 0.3262220621109009, + "num_tokens": 460719524.0, + "step": 902 + }, + { + "epoch": 0.2441860465116279, + "grad_norm": 3.46875, + "learning_rate": 0.01998337985784312, + "loss": 3.5779, + "mean_token_accuracy": 0.35766029357910156, + "num_tokens": 461213742.0, + "step": 903 + }, + { + "epoch": 0.24445646295294754, + "grad_norm": 3.234375, + "learning_rate": 0.019983283956193004, + "loss": 3.5844, + "mean_token_accuracy": 0.36210983991622925, + "num_tokens": 461737829.0, + "step": 904 + }, + { + "epoch": 0.24472687939426718, + "grad_norm": 3.625, + "learning_rate": 0.01998318777890932, + "loss": 3.5098, + "mean_token_accuracy": 0.37839892506599426, + "num_tokens": 462202844.0, + "step": 905 + }, + { + "epoch": 0.2449972958355868, + "grad_norm": 2.921875, + "learning_rate": 0.019983091325995014, + "loss": 3.1765, + "mean_token_accuracy": 0.40535876154899597, + "num_tokens": 462666880.0, + "step": 906 + }, + { + "epoch": 0.24526771227690644, + "grad_norm": 3.140625, + "learning_rate": 0.01998299459745305, + "loss": 3.4306, + "mean_token_accuracy": 0.35349541902542114, + "num_tokens": 463191125.0, + "step": 907 + }, + { + "epoch": 0.24553812871822606, + "grad_norm": 3.109375, + "learning_rate": 0.019982897593286395, + "loss": 3.7292, + "mean_token_accuracy": 0.35645902156829834, + "num_tokens": 463689978.0, + "step": 908 + }, + { + "epoch": 0.2458085451595457, + "grad_norm": 2.5625, + "learning_rate": 0.019982800313498025, + "loss": 3.4161, + "mean_token_accuracy": 0.36174923181533813, + "num_tokens": 464213984.0, + "step": 909 + }, + { + "epoch": 0.24607896160086534, + "grad_norm": 3.15625, + "learning_rate": 0.01998270275809093, + "loss": 3.3453, + "mean_token_accuracy": 0.38715261220932007, + "num_tokens": 464738214.0, + "step": 910 + }, + { + "epoch": 0.24634937804218496, + "grad_norm": 124.0, + "learning_rate": 0.01998260492706809, + "loss": 13.3618, + "mean_token_accuracy": 3.223032763344236e-05, + "num_tokens": 465262448.0, + "step": 911 + }, + { + "epoch": 0.2466197944835046, + "grad_norm": 6.0, + "learning_rate": 0.019982506820432514, + "loss": 4.0042, + "mean_token_accuracy": 0.32208219170570374, + "num_tokens": 465786636.0, + "step": 912 + }, + { + "epoch": 0.24689021092482422, + "grad_norm": 2.515625, + "learning_rate": 0.019982408438187223, + "loss": 3.46, + "mean_token_accuracy": 0.35560551285743713, + "num_tokens": 466310746.0, + "step": 913 + }, + { + "epoch": 0.24716062736614386, + "grad_norm": 3.96875, + "learning_rate": 0.019982309780335217, + "loss": 3.5714, + "mean_token_accuracy": 0.3759494423866272, + "num_tokens": 466834879.0, + "step": 914 + }, + { + "epoch": 0.2474310438074635, + "grad_norm": 3.421875, + "learning_rate": 0.019982210846879535, + "loss": 3.3313, + "mean_token_accuracy": 0.3703067898750305, + "num_tokens": 467359145.0, + "step": 915 + }, + { + "epoch": 0.24770146024878312, + "grad_norm": 2.625, + "learning_rate": 0.019982111637823208, + "loss": 3.5332, + "mean_token_accuracy": 0.36034584045410156, + "num_tokens": 467883282.0, + "step": 916 + }, + { + "epoch": 0.24797187669010276, + "grad_norm": 2.78125, + "learning_rate": 0.019982012153169283, + "loss": 3.3785, + "mean_token_accuracy": 0.3656262457370758, + "num_tokens": 468407380.0, + "step": 917 + }, + { + "epoch": 0.2482422931314224, + "grad_norm": 2.796875, + "learning_rate": 0.019981912392920808, + "loss": 3.475, + "mean_token_accuracy": 0.35967129468917847, + "num_tokens": 468911990.0, + "step": 918 + }, + { + "epoch": 0.24851270957274202, + "grad_norm": 3.234375, + "learning_rate": 0.01998181235708085, + "loss": 3.5257, + "mean_token_accuracy": 0.368042528629303, + "num_tokens": 469369094.0, + "step": 919 + }, + { + "epoch": 0.24878312601406166, + "grad_norm": 2.6875, + "learning_rate": 0.019981712045652467, + "loss": 3.5927, + "mean_token_accuracy": 0.35959744453430176, + "num_tokens": 469893298.0, + "step": 920 + }, + { + "epoch": 0.24905354245538128, + "grad_norm": 2.21875, + "learning_rate": 0.019981611458638752, + "loss": 3.2473, + "mean_token_accuracy": 0.3725181221961975, + "num_tokens": 470362529.0, + "step": 921 + }, + { + "epoch": 0.24932395889670092, + "grad_norm": 2.828125, + "learning_rate": 0.019981510596042786, + "loss": 3.5052, + "mean_token_accuracy": 0.3607480525970459, + "num_tokens": 470883173.0, + "step": 922 + }, + { + "epoch": 0.24959437533802056, + "grad_norm": 2.59375, + "learning_rate": 0.019981409457867656, + "loss": 3.4267, + "mean_token_accuracy": 0.3684951066970825, + "num_tokens": 471407440.0, + "step": 923 + }, + { + "epoch": 0.24986479177934018, + "grad_norm": 2.15625, + "learning_rate": 0.019981308044116474, + "loss": 3.3627, + "mean_token_accuracy": 0.36582791805267334, + "num_tokens": 471931658.0, + "step": 924 + }, + { + "epoch": 0.2501352082206598, + "grad_norm": 3.390625, + "learning_rate": 0.019981206354792353, + "loss": 3.6074, + "mean_token_accuracy": 0.3527907729148865, + "num_tokens": 472433293.0, + "step": 925 + }, + { + "epoch": 0.25040562466197946, + "grad_norm": 3.953125, + "learning_rate": 0.019981104389898405, + "loss": 3.5286, + "mean_token_accuracy": 0.36734652519226074, + "num_tokens": 472957448.0, + "step": 926 + }, + { + "epoch": 0.2506760411032991, + "grad_norm": 2.671875, + "learning_rate": 0.019981002149437762, + "loss": 3.4384, + "mean_token_accuracy": 0.36195069551467896, + "num_tokens": 473481685.0, + "step": 927 + }, + { + "epoch": 0.2509464575446187, + "grad_norm": 22.375, + "learning_rate": 0.019980899633413565, + "loss": 3.6947, + "mean_token_accuracy": 0.333050012588501, + "num_tokens": 473967590.0, + "step": 928 + }, + { + "epoch": 0.25121687398593834, + "grad_norm": 3.890625, + "learning_rate": 0.01998079684182895, + "loss": 3.709, + "mean_token_accuracy": 0.3363588750362396, + "num_tokens": 474484083.0, + "step": 929 + }, + { + "epoch": 0.251487290427258, + "grad_norm": 2.640625, + "learning_rate": 0.01998069377468708, + "loss": 3.5787, + "mean_token_accuracy": 0.35703980922698975, + "num_tokens": 475008346.0, + "step": 930 + }, + { + "epoch": 0.2517577068685776, + "grad_norm": 60.75, + "learning_rate": 0.019980590431991116, + "loss": 14.7709, + "mean_token_accuracy": 0.016123535111546516, + "num_tokens": 475532622.0, + "step": 931 + }, + { + "epoch": 0.25202812330989727, + "grad_norm": 8.1875, + "learning_rate": 0.01998048681374423, + "loss": 4.3093, + "mean_token_accuracy": 0.26348191499710083, + "num_tokens": 476056863.0, + "step": 932 + }, + { + "epoch": 0.25229853975121685, + "grad_norm": 2.671875, + "learning_rate": 0.019980382919949594, + "loss": 3.7097, + "mean_token_accuracy": 0.321442574262619, + "num_tokens": 476581094.0, + "step": 933 + }, + { + "epoch": 0.2525689561925365, + "grad_norm": 2.765625, + "learning_rate": 0.0199802787506104, + "loss": 3.3004, + "mean_token_accuracy": 0.3741699457168579, + "num_tokens": 477105355.0, + "step": 934 + }, + { + "epoch": 0.25283937263385614, + "grad_norm": 6.15625, + "learning_rate": 0.019980174305729848, + "loss": 3.1812, + "mean_token_accuracy": 0.4309217631816864, + "num_tokens": 477629471.0, + "step": 935 + }, + { + "epoch": 0.2531097890751758, + "grad_norm": 3.21875, + "learning_rate": 0.01998006958531114, + "loss": 3.266, + "mean_token_accuracy": 0.36922502517700195, + "num_tokens": 478130113.0, + "step": 936 + }, + { + "epoch": 0.2533802055164954, + "grad_norm": 4.875, + "learning_rate": 0.019979964589357482, + "loss": 3.3292, + "mean_token_accuracy": 0.34448739886283875, + "num_tokens": 478654368.0, + "step": 937 + }, + { + "epoch": 0.253650621957815, + "grad_norm": 2.9375, + "learning_rate": 0.01997985931787211, + "loss": 3.3857, + "mean_token_accuracy": 0.36559152603149414, + "num_tokens": 479178583.0, + "step": 938 + }, + { + "epoch": 0.25392103839913466, + "grad_norm": 4.1875, + "learning_rate": 0.01997975377085824, + "loss": 3.5503, + "mean_token_accuracy": 0.35755228996276855, + "num_tokens": 479694970.0, + "step": 939 + }, + { + "epoch": 0.2541914548404543, + "grad_norm": 3.09375, + "learning_rate": 0.01997964794831912, + "loss": 3.4404, + "mean_token_accuracy": 0.36960920691490173, + "num_tokens": 480178994.0, + "step": 940 + }, + { + "epoch": 0.25446187128177394, + "grad_norm": 2.859375, + "learning_rate": 0.019979541850257992, + "loss": 3.542, + "mean_token_accuracy": 0.3642583191394806, + "num_tokens": 480703274.0, + "step": 941 + }, + { + "epoch": 0.2547322877230936, + "grad_norm": 2.4375, + "learning_rate": 0.01997943547667811, + "loss": 3.2613, + "mean_token_accuracy": 0.3781517744064331, + "num_tokens": 481227411.0, + "step": 942 + }, + { + "epoch": 0.2550027041644132, + "grad_norm": 3.109375, + "learning_rate": 0.019979328827582746, + "loss": 3.4409, + "mean_token_accuracy": 0.3616757094860077, + "num_tokens": 481751518.0, + "step": 943 + }, + { + "epoch": 0.2552731206057328, + "grad_norm": 2.96875, + "learning_rate": 0.019979221902975165, + "loss": 3.5572, + "mean_token_accuracy": 0.36075359582901, + "num_tokens": 482232944.0, + "step": 944 + }, + { + "epoch": 0.25554353704705246, + "grad_norm": 2.953125, + "learning_rate": 0.019979114702858652, + "loss": 3.5616, + "mean_token_accuracy": 0.3225741386413574, + "num_tokens": 482756913.0, + "step": 945 + }, + { + "epoch": 0.2558139534883721, + "grad_norm": 2.46875, + "learning_rate": 0.019979007227236494, + "loss": 3.4163, + "mean_token_accuracy": 0.34602904319763184, + "num_tokens": 483281168.0, + "step": 946 + }, + { + "epoch": 0.25608436992969175, + "grad_norm": 3.21875, + "learning_rate": 0.019978899476111985, + "loss": 3.3325, + "mean_token_accuracy": 0.3614712357521057, + "num_tokens": 483805430.0, + "step": 947 + }, + { + "epoch": 0.25635478637101133, + "grad_norm": 2.984375, + "learning_rate": 0.019978791449488435, + "loss": 3.225, + "mean_token_accuracy": 0.3820565938949585, + "num_tokens": 484272301.0, + "step": 948 + }, + { + "epoch": 0.256625202812331, + "grad_norm": 2.71875, + "learning_rate": 0.019978683147369163, + "loss": 3.3514, + "mean_token_accuracy": 0.38113802671432495, + "num_tokens": 484796526.0, + "step": 949 + }, + { + "epoch": 0.2568956192536506, + "grad_norm": 3.359375, + "learning_rate": 0.019978574569757483, + "loss": 3.4926, + "mean_token_accuracy": 0.35290902853012085, + "num_tokens": 485320754.0, + "step": 950 + }, + { + "epoch": 0.25716603569497026, + "grad_norm": 58.75, + "learning_rate": 0.01997846571665673, + "loss": 28.0436, + "mean_token_accuracy": 0.0, + "num_tokens": 485845024.0, + "step": 951 + }, + { + "epoch": 0.2574364521362899, + "grad_norm": 6.84375, + "learning_rate": 0.01997835658807025, + "loss": 4.1334, + "mean_token_accuracy": 0.3424582779407501, + "num_tokens": 486309674.0, + "step": 952 + }, + { + "epoch": 0.2577068685776095, + "grad_norm": 3.21875, + "learning_rate": 0.019978247184001385, + "loss": 3.4762, + "mean_token_accuracy": 0.3469051420688629, + "num_tokens": 486833962.0, + "step": 953 + }, + { + "epoch": 0.25797728501892914, + "grad_norm": 3.03125, + "learning_rate": 0.01997813750445349, + "loss": 3.3672, + "mean_token_accuracy": 0.37008509039878845, + "num_tokens": 487358151.0, + "step": 954 + }, + { + "epoch": 0.2582477014602488, + "grad_norm": 3.921875, + "learning_rate": 0.019978027549429934, + "loss": 3.6284, + "mean_token_accuracy": 0.3082292675971985, + "num_tokens": 487882414.0, + "step": 955 + }, + { + "epoch": 0.2585181179015684, + "grad_norm": 3.0, + "learning_rate": 0.019977917318934092, + "loss": 3.366, + "mean_token_accuracy": 0.3765069246292114, + "num_tokens": 488406494.0, + "step": 956 + }, + { + "epoch": 0.25878853434288807, + "grad_norm": 3.4375, + "learning_rate": 0.019977806812969342, + "loss": 3.5895, + "mean_token_accuracy": 0.34929776191711426, + "num_tokens": 488930552.0, + "step": 957 + }, + { + "epoch": 0.25905895078420765, + "grad_norm": 3.484375, + "learning_rate": 0.019977696031539083, + "loss": 3.2858, + "mean_token_accuracy": 0.3426187038421631, + "num_tokens": 489454645.0, + "step": 958 + }, + { + "epoch": 0.2593293672255273, + "grad_norm": 2.484375, + "learning_rate": 0.0199775849746467, + "loss": 3.5659, + "mean_token_accuracy": 0.35545140504837036, + "num_tokens": 489978898.0, + "step": 959 + }, + { + "epoch": 0.25959978366684694, + "grad_norm": 3.609375, + "learning_rate": 0.019977473642295615, + "loss": 3.6404, + "mean_token_accuracy": 0.33162325620651245, + "num_tokens": 490503034.0, + "step": 960 + }, + { + "epoch": 0.2598702001081666, + "grad_norm": 3.625, + "learning_rate": 0.019977362034489233, + "loss": 3.3798, + "mean_token_accuracy": 0.363447904586792, + "num_tokens": 491027305.0, + "step": 961 + }, + { + "epoch": 0.2601406165494862, + "grad_norm": 3.21875, + "learning_rate": 0.019977250151230988, + "loss": 3.1583, + "mean_token_accuracy": 0.3900560140609741, + "num_tokens": 491438782.0, + "step": 962 + }, + { + "epoch": 0.26041103299080587, + "grad_norm": 3.09375, + "learning_rate": 0.019977137992524305, + "loss": 3.6258, + "mean_token_accuracy": 0.35804885625839233, + "num_tokens": 491963056.0, + "step": 963 + }, + { + "epoch": 0.26068144943212546, + "grad_norm": 3.0625, + "learning_rate": 0.019977025558372625, + "loss": 3.4099, + "mean_token_accuracy": 0.3742004632949829, + "num_tokens": 492487317.0, + "step": 964 + }, + { + "epoch": 0.2609518658734451, + "grad_norm": 3.03125, + "learning_rate": 0.019976912848779404, + "loss": 3.6531, + "mean_token_accuracy": 0.35938358306884766, + "num_tokens": 493011488.0, + "step": 965 + }, + { + "epoch": 0.26122228231476474, + "grad_norm": 3.234375, + "learning_rate": 0.019976799863748097, + "loss": 3.5298, + "mean_token_accuracy": 0.36692118644714355, + "num_tokens": 493518605.0, + "step": 966 + }, + { + "epoch": 0.2614926987560844, + "grad_norm": 3.484375, + "learning_rate": 0.019976686603282175, + "loss": 3.4679, + "mean_token_accuracy": 0.37605392932891846, + "num_tokens": 494042775.0, + "step": 967 + }, + { + "epoch": 0.26176311519740403, + "grad_norm": 3.453125, + "learning_rate": 0.019976573067385105, + "loss": 3.5272, + "mean_token_accuracy": 0.3704160749912262, + "num_tokens": 494566966.0, + "step": 968 + }, + { + "epoch": 0.2620335316387236, + "grad_norm": 3.640625, + "learning_rate": 0.019976459256060376, + "loss": 3.4593, + "mean_token_accuracy": 0.3538719713687897, + "num_tokens": 495086973.0, + "step": 969 + }, + { + "epoch": 0.26230394808004326, + "grad_norm": 2.390625, + "learning_rate": 0.01997634516931148, + "loss": 3.3865, + "mean_token_accuracy": 0.38843584060668945, + "num_tokens": 495559277.0, + "step": 970 + }, + { + "epoch": 0.2625743645213629, + "grad_norm": 221.0, + "learning_rate": 0.019976230807141917, + "loss": 14.3433, + "mean_token_accuracy": 0.006863541901111603, + "num_tokens": 496083544.0, + "step": 971 + }, + { + "epoch": 0.26284478096268254, + "grad_norm": 10.875, + "learning_rate": 0.01997611616955519, + "loss": 4.4703, + "mean_token_accuracy": 0.23286721110343933, + "num_tokens": 496607817.0, + "step": 972 + }, + { + "epoch": 0.2631151974040022, + "grad_norm": 2.671875, + "learning_rate": 0.019976001256554826, + "loss": 3.6731, + "mean_token_accuracy": 0.3256494104862213, + "num_tokens": 497132026.0, + "step": 973 + }, + { + "epoch": 0.2633856138453218, + "grad_norm": 2.640625, + "learning_rate": 0.01997588606814435, + "loss": 3.5447, + "mean_token_accuracy": 0.3499599099159241, + "num_tokens": 497656242.0, + "step": 974 + }, + { + "epoch": 0.2636560302866414, + "grad_norm": 3.90625, + "learning_rate": 0.01997577060432729, + "loss": 3.4626, + "mean_token_accuracy": 0.3206809461116791, + "num_tokens": 498180401.0, + "step": 975 + }, + { + "epoch": 0.26392644672796106, + "grad_norm": 2.96875, + "learning_rate": 0.01997565486510719, + "loss": 3.479, + "mean_token_accuracy": 0.37274011969566345, + "num_tokens": 498704623.0, + "step": 976 + }, + { + "epoch": 0.2641968631692807, + "grad_norm": 4.0, + "learning_rate": 0.01997553885048761, + "loss": 3.563, + "mean_token_accuracy": 0.3370276689529419, + "num_tokens": 499228903.0, + "step": 977 + }, + { + "epoch": 0.26446727961060035, + "grad_norm": 3.171875, + "learning_rate": 0.019975422560472095, + "loss": 3.6199, + "mean_token_accuracy": 0.4151662588119507, + "num_tokens": 499642039.0, + "step": 978 + }, + { + "epoch": 0.26473769605191994, + "grad_norm": 3.390625, + "learning_rate": 0.01997530599506422, + "loss": 3.6512, + "mean_token_accuracy": 0.3241252899169922, + "num_tokens": 500166294.0, + "step": 979 + }, + { + "epoch": 0.2650081124932396, + "grad_norm": 2.609375, + "learning_rate": 0.019975189154267565, + "loss": 3.2922, + "mean_token_accuracy": 0.394553005695343, + "num_tokens": 500690464.0, + "step": 980 + }, + { + "epoch": 0.2652785289345592, + "grad_norm": 3.65625, + "learning_rate": 0.019975072038085715, + "loss": 3.0989, + "mean_token_accuracy": 0.39354208111763, + "num_tokens": 501214701.0, + "step": 981 + }, + { + "epoch": 0.26554894537587886, + "grad_norm": 3.015625, + "learning_rate": 0.01997495464652226, + "loss": 3.4169, + "mean_token_accuracy": 0.36887460947036743, + "num_tokens": 501738945.0, + "step": 982 + }, + { + "epoch": 0.2658193618171985, + "grad_norm": 3.515625, + "learning_rate": 0.0199748369795808, + "loss": 3.469, + "mean_token_accuracy": 0.35596323013305664, + "num_tokens": 502263180.0, + "step": 983 + }, + { + "epoch": 0.2660897782585181, + "grad_norm": 2.953125, + "learning_rate": 0.01997471903726495, + "loss": 3.2846, + "mean_token_accuracy": 0.3960581123828888, + "num_tokens": 502742436.0, + "step": 984 + }, + { + "epoch": 0.26636019469983774, + "grad_norm": 3.046875, + "learning_rate": 0.019974600819578327, + "loss": 3.4971, + "mean_token_accuracy": 0.3496885299682617, + "num_tokens": 503266672.0, + "step": 985 + }, + { + "epoch": 0.2666306111411574, + "grad_norm": 3.484375, + "learning_rate": 0.01997448232652456, + "loss": 3.5433, + "mean_token_accuracy": 0.3777344822883606, + "num_tokens": 503790878.0, + "step": 986 + }, + { + "epoch": 0.266901027582477, + "grad_norm": 5.125, + "learning_rate": 0.01997436355810728, + "loss": 3.4796, + "mean_token_accuracy": 0.3521711826324463, + "num_tokens": 504315023.0, + "step": 987 + }, + { + "epoch": 0.26717144402379667, + "grad_norm": 3.171875, + "learning_rate": 0.019974244514330136, + "loss": 3.3557, + "mean_token_accuracy": 0.3676270842552185, + "num_tokens": 504839205.0, + "step": 988 + }, + { + "epoch": 0.26744186046511625, + "grad_norm": 2.984375, + "learning_rate": 0.01997412519519678, + "loss": 3.474, + "mean_token_accuracy": 0.37715449929237366, + "num_tokens": 505363466.0, + "step": 989 + }, + { + "epoch": 0.2677122769064359, + "grad_norm": 2.875, + "learning_rate": 0.01997400560071087, + "loss": 3.2627, + "mean_token_accuracy": 0.36885470151901245, + "num_tokens": 505836433.0, + "step": 990 + }, + { + "epoch": 0.26798269334775554, + "grad_norm": 6.9375, + "learning_rate": 0.019973885730876075, + "loss": 9.7253, + "mean_token_accuracy": 0.013989973813295364, + "num_tokens": 506360707.0, + "step": 991 + }, + { + "epoch": 0.2682531097890752, + "grad_norm": 6.5, + "learning_rate": 0.019973765585696076, + "loss": 4.0852, + "mean_token_accuracy": 0.32154184579849243, + "num_tokens": 506884809.0, + "step": 992 + }, + { + "epoch": 0.2685235262303948, + "grad_norm": 2.515625, + "learning_rate": 0.019973645165174563, + "loss": 3.3587, + "mean_token_accuracy": 0.3658737540245056, + "num_tokens": 507397841.0, + "step": 993 + }, + { + "epoch": 0.2687939426717144, + "grad_norm": 3.078125, + "learning_rate": 0.019973524469315225, + "loss": 3.4715, + "mean_token_accuracy": 0.3574601411819458, + "num_tokens": 507922077.0, + "step": 994 + }, + { + "epoch": 0.26906435911303406, + "grad_norm": 2.765625, + "learning_rate": 0.019973403498121763, + "loss": 3.4153, + "mean_token_accuracy": 0.370627224445343, + "num_tokens": 508446245.0, + "step": 995 + }, + { + "epoch": 0.2693347755543537, + "grad_norm": 2.4375, + "learning_rate": 0.019973282251597897, + "loss": 3.3091, + "mean_token_accuracy": 0.36675429344177246, + "num_tokens": 508970452.0, + "step": 996 + }, + { + "epoch": 0.26960519199567334, + "grad_norm": 2.671875, + "learning_rate": 0.019973160729747343, + "loss": 3.2676, + "mean_token_accuracy": 0.384259432554245, + "num_tokens": 509494687.0, + "step": 997 + }, + { + "epoch": 0.269875608436993, + "grad_norm": 3.03125, + "learning_rate": 0.019973038932573826, + "loss": 3.5351, + "mean_token_accuracy": 0.35451585054397583, + "num_tokens": 510017750.0, + "step": 998 + }, + { + "epoch": 0.2701460248783126, + "grad_norm": 3.078125, + "learning_rate": 0.01997291686008109, + "loss": 3.5943, + "mean_token_accuracy": 0.36723798513412476, + "num_tokens": 510542027.0, + "step": 999 + }, + { + "epoch": 0.2704164413196322, + "grad_norm": 3.03125, + "learning_rate": 0.019972794512272875, + "loss": 3.3488, + "mean_token_accuracy": 0.3609820604324341, + "num_tokens": 511066175.0, + "step": 1000 + }, + { + "epoch": 0.27068685776095186, + "grad_norm": 2.953125, + "learning_rate": 0.019972671889152936, + "loss": 3.423, + "mean_token_accuracy": 0.3751611113548279, + "num_tokens": 511545902.0, + "step": 1001 + }, + { + "epoch": 0.2709572742022715, + "grad_norm": 2.65625, + "learning_rate": 0.01997254899072504, + "loss": 3.4396, + "mean_token_accuracy": 0.35366928577423096, + "num_tokens": 512070090.0, + "step": 1002 + }, + { + "epoch": 0.27122769064359115, + "grad_norm": 2.609375, + "learning_rate": 0.01997242581699295, + "loss": 3.5753, + "mean_token_accuracy": 0.33924931287765503, + "num_tokens": 512594182.0, + "step": 1003 + }, + { + "epoch": 0.2714981070849108, + "grad_norm": 2.78125, + "learning_rate": 0.019972302367960455, + "loss": 3.3039, + "mean_token_accuracy": 0.357174813747406, + "num_tokens": 513075510.0, + "step": 1004 + }, + { + "epoch": 0.2717685235262304, + "grad_norm": 2.90625, + "learning_rate": 0.019972178643631332, + "loss": 3.417, + "mean_token_accuracy": 0.3707423806190491, + "num_tokens": 513599793.0, + "step": 1005 + }, + { + "epoch": 0.27203893996755, + "grad_norm": 3.0625, + "learning_rate": 0.019972054644009387, + "loss": 3.1799, + "mean_token_accuracy": 0.3800102770328522, + "num_tokens": 514030215.0, + "step": 1006 + }, + { + "epoch": 0.27230935640886966, + "grad_norm": 2.171875, + "learning_rate": 0.019971930369098417, + "loss": 3.3092, + "mean_token_accuracy": 0.37142786383628845, + "num_tokens": 514554482.0, + "step": 1007 + }, + { + "epoch": 0.2725797728501893, + "grad_norm": 2.96875, + "learning_rate": 0.019971805818902243, + "loss": 3.4132, + "mean_token_accuracy": 0.35780835151672363, + "num_tokens": 515078753.0, + "step": 1008 + }, + { + "epoch": 0.27285018929150895, + "grad_norm": 3.265625, + "learning_rate": 0.019971680993424678, + "loss": 3.3304, + "mean_token_accuracy": 0.40200889110565186, + "num_tokens": 515602847.0, + "step": 1009 + }, + { + "epoch": 0.27312060573282854, + "grad_norm": 2.953125, + "learning_rate": 0.01997155589266956, + "loss": 3.4986, + "mean_token_accuracy": 0.36068758368492126, + "num_tokens": 516127062.0, + "step": 1010 + }, + { + "epoch": 0.2733910221741482, + "grad_norm": 157.0, + "learning_rate": 0.019971430516640718, + "loss": 22.3593, + "mean_token_accuracy": 0.0, + "num_tokens": 516572016.0, + "step": 1011 + }, + { + "epoch": 0.2736614386154678, + "grad_norm": 8.0625, + "learning_rate": 0.019971304865342012, + "loss": 4.1497, + "mean_token_accuracy": 0.3125190734863281, + "num_tokens": 517096297.0, + "step": 1012 + }, + { + "epoch": 0.27393185505678747, + "grad_norm": 3.4375, + "learning_rate": 0.01997117893877728, + "loss": 3.7953, + "mean_token_accuracy": 0.35357993841171265, + "num_tokens": 517620510.0, + "step": 1013 + }, + { + "epoch": 0.2742022714981071, + "grad_norm": 3.59375, + "learning_rate": 0.019971052736950407, + "loss": 3.4139, + "mean_token_accuracy": 0.34366247057914734, + "num_tokens": 518130177.0, + "step": 1014 + }, + { + "epoch": 0.2744726879394267, + "grad_norm": 2.78125, + "learning_rate": 0.019970926259865244, + "loss": 3.6619, + "mean_token_accuracy": 0.3553948998451233, + "num_tokens": 518589938.0, + "step": 1015 + }, + { + "epoch": 0.27474310438074634, + "grad_norm": 4.15625, + "learning_rate": 0.01997079950752569, + "loss": 3.5203, + "mean_token_accuracy": 0.371063768863678, + "num_tokens": 519036705.0, + "step": 1016 + }, + { + "epoch": 0.275013520822066, + "grad_norm": 18.125, + "learning_rate": 0.019970672479935617, + "loss": 3.4414, + "mean_token_accuracy": 0.3667314946651459, + "num_tokens": 519560890.0, + "step": 1017 + }, + { + "epoch": 0.2752839372633856, + "grad_norm": 2.875, + "learning_rate": 0.01997054517709894, + "loss": 3.7269, + "mean_token_accuracy": 0.366962194442749, + "num_tokens": 520043789.0, + "step": 1018 + }, + { + "epoch": 0.27555435370470527, + "grad_norm": 2.125, + "learning_rate": 0.019970417599019546, + "loss": 3.5529, + "mean_token_accuracy": 0.3619183599948883, + "num_tokens": 520568000.0, + "step": 1019 + }, + { + "epoch": 0.27582477014602486, + "grad_norm": 2.6875, + "learning_rate": 0.019970289745701367, + "loss": 3.2978, + "mean_token_accuracy": 0.37326937913894653, + "num_tokens": 521089934.0, + "step": 1020 + }, + { + "epoch": 0.2760951865873445, + "grad_norm": 2.5, + "learning_rate": 0.01997016161714832, + "loss": 3.5069, + "mean_token_accuracy": 0.3627729117870331, + "num_tokens": 521614136.0, + "step": 1021 + }, + { + "epoch": 0.27636560302866414, + "grad_norm": 3.453125, + "learning_rate": 0.01997003321336433, + "loss": 3.4471, + "mean_token_accuracy": 0.3541688323020935, + "num_tokens": 522138376.0, + "step": 1022 + }, + { + "epoch": 0.2766360194699838, + "grad_norm": 5.90625, + "learning_rate": 0.019969904534353346, + "loss": 3.2742, + "mean_token_accuracy": 0.346468985080719, + "num_tokens": 522662607.0, + "step": 1023 + }, + { + "epoch": 0.27690643591130343, + "grad_norm": 2.5, + "learning_rate": 0.019969775580119312, + "loss": 3.3905, + "mean_token_accuracy": 0.3878742456436157, + "num_tokens": 523175459.0, + "step": 1024 + }, + { + "epoch": 0.277176852352623, + "grad_norm": 3.5625, + "learning_rate": 0.01996964635066618, + "loss": 3.5701, + "mean_token_accuracy": 0.3552670478820801, + "num_tokens": 523681639.0, + "step": 1025 + }, + { + "epoch": 0.27744726879394266, + "grad_norm": 3.25, + "learning_rate": 0.019969516845997928, + "loss": 3.4121, + "mean_token_accuracy": 0.3748547434806824, + "num_tokens": 524205841.0, + "step": 1026 + }, + { + "epoch": 0.2777176852352623, + "grad_norm": 2.984375, + "learning_rate": 0.019969387066118515, + "loss": 3.2336, + "mean_token_accuracy": 0.3803238868713379, + "num_tokens": 524730110.0, + "step": 1027 + }, + { + "epoch": 0.27798810167658194, + "grad_norm": 3.0, + "learning_rate": 0.01996925701103193, + "loss": 3.3465, + "mean_token_accuracy": 0.37089505791664124, + "num_tokens": 525254302.0, + "step": 1028 + }, + { + "epoch": 0.2782585181179016, + "grad_norm": 3.71875, + "learning_rate": 0.019969126680742164, + "loss": 3.5414, + "mean_token_accuracy": 0.37409508228302, + "num_tokens": 525777346.0, + "step": 1029 + }, + { + "epoch": 0.2785289345592212, + "grad_norm": 3.203125, + "learning_rate": 0.01996899607525322, + "loss": 3.5345, + "mean_token_accuracy": 0.346515417098999, + "num_tokens": 526301621.0, + "step": 1030 + }, + { + "epoch": 0.2787993510005408, + "grad_norm": 160.0, + "learning_rate": 0.01996886519456909, + "loss": 13.1875, + "mean_token_accuracy": 0.00013767051859758794, + "num_tokens": 526825804.0, + "step": 1031 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 8.0625, + "learning_rate": 0.019968734038693807, + "loss": 4.2177, + "mean_token_accuracy": 0.29391953349113464, + "num_tokens": 527349965.0, + "step": 1032 + }, + { + "epoch": 0.2793401838831801, + "grad_norm": 3.21875, + "learning_rate": 0.019968602607631388, + "loss": 3.5285, + "mean_token_accuracy": 0.35463225841522217, + "num_tokens": 527874249.0, + "step": 1033 + }, + { + "epoch": 0.27961060032449975, + "grad_norm": 3.453125, + "learning_rate": 0.019968470901385864, + "loss": 3.6587, + "mean_token_accuracy": 0.36874011158943176, + "num_tokens": 528356667.0, + "step": 1034 + }, + { + "epoch": 0.27988101676581933, + "grad_norm": 3.625, + "learning_rate": 0.01996833891996128, + "loss": 3.715, + "mean_token_accuracy": 0.36048850417137146, + "num_tokens": 528880950.0, + "step": 1035 + }, + { + "epoch": 0.280151433207139, + "grad_norm": 3.140625, + "learning_rate": 0.019968206663361684, + "loss": 3.6161, + "mean_token_accuracy": 0.3352564871311188, + "num_tokens": 529405201.0, + "step": 1036 + }, + { + "epoch": 0.2804218496484586, + "grad_norm": 3.03125, + "learning_rate": 0.019968074131591133, + "loss": 3.509, + "mean_token_accuracy": 0.36464670300483704, + "num_tokens": 529929422.0, + "step": 1037 + }, + { + "epoch": 0.28069226608977826, + "grad_norm": 3.609375, + "learning_rate": 0.019967941324653692, + "loss": 3.6232, + "mean_token_accuracy": 0.35457152128219604, + "num_tokens": 530448911.0, + "step": 1038 + }, + { + "epoch": 0.2809626825310979, + "grad_norm": 2.953125, + "learning_rate": 0.01996780824255344, + "loss": 3.4785, + "mean_token_accuracy": 0.36313700675964355, + "num_tokens": 530973140.0, + "step": 1039 + }, + { + "epoch": 0.28123309897241755, + "grad_norm": 4.375, + "learning_rate": 0.019967674885294464, + "loss": 3.4125, + "mean_token_accuracy": 0.3648623824119568, + "num_tokens": 531497287.0, + "step": 1040 + }, + { + "epoch": 0.28150351541373714, + "grad_norm": 2.578125, + "learning_rate": 0.019967541252880843, + "loss": 3.4808, + "mean_token_accuracy": 0.3761240243911743, + "num_tokens": 531978201.0, + "step": 1041 + }, + { + "epoch": 0.2817739318550568, + "grad_norm": 2.703125, + "learning_rate": 0.019967407345316684, + "loss": 3.2689, + "mean_token_accuracy": 0.3726884722709656, + "num_tokens": 532502381.0, + "step": 1042 + }, + { + "epoch": 0.2820443482963764, + "grad_norm": 2.515625, + "learning_rate": 0.0199672731626061, + "loss": 3.2298, + "mean_token_accuracy": 0.3687710165977478, + "num_tokens": 533026651.0, + "step": 1043 + }, + { + "epoch": 0.28231476473769607, + "grad_norm": 2.765625, + "learning_rate": 0.019967138704753203, + "loss": 3.5588, + "mean_token_accuracy": 0.3617037236690521, + "num_tokens": 533550878.0, + "step": 1044 + }, + { + "epoch": 0.2825851811790157, + "grad_norm": 2.90625, + "learning_rate": 0.019967003971762123, + "loss": 3.3487, + "mean_token_accuracy": 0.3686767816543579, + "num_tokens": 534075035.0, + "step": 1045 + }, + { + "epoch": 0.2828555976203353, + "grad_norm": 3.015625, + "learning_rate": 0.01996686896363699, + "loss": 3.489, + "mean_token_accuracy": 0.37161368131637573, + "num_tokens": 534599237.0, + "step": 1046 + }, + { + "epoch": 0.28312601406165494, + "grad_norm": 3.078125, + "learning_rate": 0.019966733680381942, + "loss": 3.6193, + "mean_token_accuracy": 0.34832435846328735, + "num_tokens": 535123447.0, + "step": 1047 + }, + { + "epoch": 0.2833964305029746, + "grad_norm": 3.015625, + "learning_rate": 0.019966598122001137, + "loss": 3.4632, + "mean_token_accuracy": 0.38296258449554443, + "num_tokens": 535614117.0, + "step": 1048 + }, + { + "epoch": 0.2836668469442942, + "grad_norm": 3.859375, + "learning_rate": 0.019966462288498736, + "loss": 3.3155, + "mean_token_accuracy": 0.3866822123527527, + "num_tokens": 536138232.0, + "step": 1049 + }, + { + "epoch": 0.28393726338561387, + "grad_norm": 2.234375, + "learning_rate": 0.019966326179878903, + "loss": 3.37, + "mean_token_accuracy": 0.3786948323249817, + "num_tokens": 536662375.0, + "step": 1050 + }, + { + "epoch": 0.28420767982693346, + "grad_norm": 90.5, + "learning_rate": 0.01996618979614581, + "loss": 15.5303, + "mean_token_accuracy": 0.0, + "num_tokens": 537162363.0, + "step": 1051 + }, + { + "epoch": 0.2844780962682531, + "grad_norm": 8.8125, + "learning_rate": 0.01996605313730365, + "loss": 4.1103, + "mean_token_accuracy": 0.30958813428878784, + "num_tokens": 537624821.0, + "step": 1052 + }, + { + "epoch": 0.28474851270957274, + "grad_norm": 2.3125, + "learning_rate": 0.019965916203356613, + "loss": 3.4725, + "mean_token_accuracy": 0.35360684990882874, + "num_tokens": 538148838.0, + "step": 1053 + }, + { + "epoch": 0.2850189291508924, + "grad_norm": 2.796875, + "learning_rate": 0.0199657789943089, + "loss": 3.3563, + "mean_token_accuracy": 0.3452944755554199, + "num_tokens": 538672933.0, + "step": 1054 + }, + { + "epoch": 0.28528934559221203, + "grad_norm": 3.875, + "learning_rate": 0.01996564151016472, + "loss": 3.5981, + "mean_token_accuracy": 0.32195931673049927, + "num_tokens": 539197057.0, + "step": 1055 + }, + { + "epoch": 0.2855597620335316, + "grad_norm": 3.3125, + "learning_rate": 0.01996550375092829, + "loss": 3.5715, + "mean_token_accuracy": 0.352703720331192, + "num_tokens": 539721334.0, + "step": 1056 + }, + { + "epoch": 0.28583017847485126, + "grad_norm": 3.765625, + "learning_rate": 0.01996536571660384, + "loss": 3.515, + "mean_token_accuracy": 0.3720250725746155, + "num_tokens": 540245579.0, + "step": 1057 + }, + { + "epoch": 0.2861005949161709, + "grad_norm": 3.265625, + "learning_rate": 0.019965227407195606, + "loss": 3.3116, + "mean_token_accuracy": 0.38977545499801636, + "num_tokens": 540705166.0, + "step": 1058 + }, + { + "epoch": 0.28637101135749055, + "grad_norm": 2.546875, + "learning_rate": 0.019965088822707828, + "loss": 3.4447, + "mean_token_accuracy": 0.36851295828819275, + "num_tokens": 541229420.0, + "step": 1059 + }, + { + "epoch": 0.2866414277988102, + "grad_norm": 2.828125, + "learning_rate": 0.019964949963144765, + "loss": 3.4619, + "mean_token_accuracy": 0.3578834533691406, + "num_tokens": 541753564.0, + "step": 1060 + }, + { + "epoch": 0.2869118442401298, + "grad_norm": 3.1875, + "learning_rate": 0.019964810828510666, + "loss": 3.3853, + "mean_token_accuracy": 0.37183767557144165, + "num_tokens": 542277723.0, + "step": 1061 + }, + { + "epoch": 0.2871822606814494, + "grad_norm": 3.765625, + "learning_rate": 0.01996467141880981, + "loss": 3.5609, + "mean_token_accuracy": 0.3430810570716858, + "num_tokens": 542801967.0, + "step": 1062 + }, + { + "epoch": 0.28745267712276906, + "grad_norm": 3.0, + "learning_rate": 0.019964531734046474, + "loss": 3.6109, + "mean_token_accuracy": 0.36679255962371826, + "num_tokens": 543326220.0, + "step": 1063 + }, + { + "epoch": 0.2877230935640887, + "grad_norm": 4.0, + "learning_rate": 0.019964391774224936, + "loss": 3.6143, + "mean_token_accuracy": 0.3401331901550293, + "num_tokens": 543806342.0, + "step": 1064 + }, + { + "epoch": 0.28799351000540835, + "grad_norm": 2.765625, + "learning_rate": 0.0199642515393495, + "loss": 3.4637, + "mean_token_accuracy": 0.36863037943840027, + "num_tokens": 544330481.0, + "step": 1065 + }, + { + "epoch": 0.28826392644672794, + "grad_norm": 2.15625, + "learning_rate": 0.019964111029424466, + "loss": 3.3275, + "mean_token_accuracy": 0.36168670654296875, + "num_tokens": 544854567.0, + "step": 1066 + }, + { + "epoch": 0.2885343428880476, + "grad_norm": 2.5, + "learning_rate": 0.019963970244454142, + "loss": 3.4194, + "mean_token_accuracy": 0.3727010488510132, + "num_tokens": 545378739.0, + "step": 1067 + }, + { + "epoch": 0.2888047593293672, + "grad_norm": 2.515625, + "learning_rate": 0.01996382918444285, + "loss": 3.341, + "mean_token_accuracy": 0.3541748523712158, + "num_tokens": 545903003.0, + "step": 1068 + }, + { + "epoch": 0.28907517577068687, + "grad_norm": 3.125, + "learning_rate": 0.019963687849394916, + "loss": 3.3003, + "mean_token_accuracy": 0.3999950885772705, + "num_tokens": 546427287.0, + "step": 1069 + }, + { + "epoch": 0.2893455922120065, + "grad_norm": 3.40625, + "learning_rate": 0.019963546239314678, + "loss": 3.4463, + "mean_token_accuracy": 0.35359781980514526, + "num_tokens": 546951324.0, + "step": 1070 + }, + { + "epoch": 0.2896160086533261, + "grad_norm": 190.0, + "learning_rate": 0.019963404354206484, + "loss": 18.2223, + "mean_token_accuracy": 0.009397082030773163, + "num_tokens": 547475586.0, + "step": 1071 + }, + { + "epoch": 0.28988642509464574, + "grad_norm": 11.625, + "learning_rate": 0.01996326219407468, + "loss": 4.6125, + "mean_token_accuracy": 0.317538321018219, + "num_tokens": 547999827.0, + "step": 1072 + }, + { + "epoch": 0.2901568415359654, + "grad_norm": 2.484375, + "learning_rate": 0.01996311975892364, + "loss": 3.3054, + "mean_token_accuracy": 0.3761722445487976, + "num_tokens": 548524034.0, + "step": 1073 + }, + { + "epoch": 0.290427257977285, + "grad_norm": 3.234375, + "learning_rate": 0.01996297704875772, + "loss": 3.5085, + "mean_token_accuracy": 0.34062790870666504, + "num_tokens": 549048133.0, + "step": 1074 + }, + { + "epoch": 0.29069767441860467, + "grad_norm": 4.15625, + "learning_rate": 0.01996283406358131, + "loss": 3.4765, + "mean_token_accuracy": 0.3441827595233917, + "num_tokens": 549572389.0, + "step": 1075 + }, + { + "epoch": 0.2909680908599243, + "grad_norm": 3.078125, + "learning_rate": 0.01996269080339879, + "loss": 3.5238, + "mean_token_accuracy": 0.35628747940063477, + "num_tokens": 550096593.0, + "step": 1076 + }, + { + "epoch": 0.2912385073012439, + "grad_norm": 3.40625, + "learning_rate": 0.019962547268214562, + "loss": 3.7079, + "mean_token_accuracy": 0.3457019329071045, + "num_tokens": 550620878.0, + "step": 1077 + }, + { + "epoch": 0.29150892374256354, + "grad_norm": 3.765625, + "learning_rate": 0.019962403458033025, + "loss": 3.5761, + "mean_token_accuracy": 0.3534322679042816, + "num_tokens": 551145061.0, + "step": 1078 + }, + { + "epoch": 0.2917793401838832, + "grad_norm": 4.5625, + "learning_rate": 0.019962259372858594, + "loss": 3.797, + "mean_token_accuracy": 0.3398037552833557, + "num_tokens": 551639815.0, + "step": 1079 + }, + { + "epoch": 0.2920497566252028, + "grad_norm": 3.3125, + "learning_rate": 0.019962115012695687, + "loss": 3.4584, + "mean_token_accuracy": 0.3496975600719452, + "num_tokens": 552164070.0, + "step": 1080 + }, + { + "epoch": 0.29232017306652247, + "grad_norm": 2.640625, + "learning_rate": 0.019961970377548738, + "loss": 3.4441, + "mean_token_accuracy": 0.3712320923805237, + "num_tokens": 552688180.0, + "step": 1081 + }, + { + "epoch": 0.29259058950784206, + "grad_norm": 2.46875, + "learning_rate": 0.01996182546742218, + "loss": 3.3129, + "mean_token_accuracy": 0.3615366816520691, + "num_tokens": 553212365.0, + "step": 1082 + }, + { + "epoch": 0.2928610059491617, + "grad_norm": 2.90625, + "learning_rate": 0.019961680282320465, + "loss": 3.6573, + "mean_token_accuracy": 0.34552133083343506, + "num_tokens": 553736607.0, + "step": 1083 + }, + { + "epoch": 0.29313142239048134, + "grad_norm": 3.78125, + "learning_rate": 0.019961534822248043, + "loss": 3.346, + "mean_token_accuracy": 0.3561801314353943, + "num_tokens": 554223058.0, + "step": 1084 + }, + { + "epoch": 0.293401838831801, + "grad_norm": 2.296875, + "learning_rate": 0.019961389087209375, + "loss": 3.1242, + "mean_token_accuracy": 0.38407477736473083, + "num_tokens": 554747299.0, + "step": 1085 + }, + { + "epoch": 0.29367225527312063, + "grad_norm": 2.9375, + "learning_rate": 0.019961243077208937, + "loss": 3.6476, + "mean_token_accuracy": 0.33783939480781555, + "num_tokens": 555271569.0, + "step": 1086 + }, + { + "epoch": 0.2939426717144402, + "grad_norm": 3.296875, + "learning_rate": 0.01996109679225121, + "loss": 3.6005, + "mean_token_accuracy": 0.35624080896377563, + "num_tokens": 555795831.0, + "step": 1087 + }, + { + "epoch": 0.29421308815575986, + "grad_norm": 2.828125, + "learning_rate": 0.019960950232340683, + "loss": 3.2528, + "mean_token_accuracy": 0.3865838646888733, + "num_tokens": 556319938.0, + "step": 1088 + }, + { + "epoch": 0.2944835045970795, + "grad_norm": 2.953125, + "learning_rate": 0.01996080339748185, + "loss": 3.2864, + "mean_token_accuracy": 0.3707638680934906, + "num_tokens": 556813948.0, + "step": 1089 + }, + { + "epoch": 0.29475392103839915, + "grad_norm": 3.125, + "learning_rate": 0.01996065628767921, + "loss": 3.3064, + "mean_token_accuracy": 0.38322192430496216, + "num_tokens": 557332424.0, + "step": 1090 + }, + { + "epoch": 0.2950243374797188, + "grad_norm": 20.25, + "learning_rate": 0.01996050890293729, + "loss": 12.525, + "mean_token_accuracy": 0.0, + "num_tokens": 557856701.0, + "step": 1091 + }, + { + "epoch": 0.2952947539210384, + "grad_norm": 5.78125, + "learning_rate": 0.019960361243260602, + "loss": 3.7425, + "mean_token_accuracy": 0.31991711258888245, + "num_tokens": 558380791.0, + "step": 1092 + }, + { + "epoch": 0.295565170362358, + "grad_norm": 2.375, + "learning_rate": 0.019960213308653683, + "loss": 3.6471, + "mean_token_accuracy": 0.3570716679096222, + "num_tokens": 558904961.0, + "step": 1093 + }, + { + "epoch": 0.29583558680367766, + "grad_norm": 3.34375, + "learning_rate": 0.019960065099121068, + "loss": 3.5479, + "mean_token_accuracy": 0.35218843817710876, + "num_tokens": 559419423.0, + "step": 1094 + }, + { + "epoch": 0.2961060032449973, + "grad_norm": 2.84375, + "learning_rate": 0.019959916614667308, + "loss": 3.4562, + "mean_token_accuracy": 0.3671489357948303, + "num_tokens": 559943589.0, + "step": 1095 + }, + { + "epoch": 0.29637641968631695, + "grad_norm": 2.609375, + "learning_rate": 0.019959767855296955, + "loss": 3.4679, + "mean_token_accuracy": 0.3659253418445587, + "num_tokens": 560467870.0, + "step": 1096 + }, + { + "epoch": 0.29664683612763654, + "grad_norm": 2.828125, + "learning_rate": 0.019959618821014574, + "loss": 3.1774, + "mean_token_accuracy": 0.38758784532546997, + "num_tokens": 560943870.0, + "step": 1097 + }, + { + "epoch": 0.2969172525689562, + "grad_norm": 4.375, + "learning_rate": 0.019959469511824743, + "loss": 3.4921, + "mean_token_accuracy": 0.3916931748390198, + "num_tokens": 561412372.0, + "step": 1098 + }, + { + "epoch": 0.2971876690102758, + "grad_norm": 4.03125, + "learning_rate": 0.019959319927732037, + "loss": 3.6297, + "mean_token_accuracy": 0.34842371940612793, + "num_tokens": 561894797.0, + "step": 1099 + }, + { + "epoch": 0.29745808545159547, + "grad_norm": 2.796875, + "learning_rate": 0.01995917006874105, + "loss": 3.4624, + "mean_token_accuracy": 0.39110320806503296, + "num_tokens": 562376250.0, + "step": 1100 + }, + { + "epoch": 0.2977285018929151, + "grad_norm": 3.3125, + "learning_rate": 0.019959019934856374, + "loss": 3.5449, + "mean_token_accuracy": 0.37169206142425537, + "num_tokens": 562900257.0, + "step": 1101 + }, + { + "epoch": 0.2979989183342347, + "grad_norm": 3.265625, + "learning_rate": 0.019958869526082623, + "loss": 3.6369, + "mean_token_accuracy": 0.35348284244537354, + "num_tokens": 563424524.0, + "step": 1102 + }, + { + "epoch": 0.29826933477555434, + "grad_norm": 3.421875, + "learning_rate": 0.019958718842424405, + "loss": 3.1401, + "mean_token_accuracy": 0.43806856870651245, + "num_tokens": 563926149.0, + "step": 1103 + }, + { + "epoch": 0.298539751216874, + "grad_norm": 3.265625, + "learning_rate": 0.01995856788388635, + "loss": 3.5021, + "mean_token_accuracy": 0.3501567840576172, + "num_tokens": 564450325.0, + "step": 1104 + }, + { + "epoch": 0.2988101676581936, + "grad_norm": 3.125, + "learning_rate": 0.019958416650473085, + "loss": 3.3889, + "mean_token_accuracy": 0.38026756048202515, + "num_tokens": 564974590.0, + "step": 1105 + }, + { + "epoch": 0.29908058409951327, + "grad_norm": 3.0, + "learning_rate": 0.019958265142189253, + "loss": 3.4281, + "mean_token_accuracy": 0.35327810049057007, + "num_tokens": 565498845.0, + "step": 1106 + }, + { + "epoch": 0.29935100054083286, + "grad_norm": 2.265625, + "learning_rate": 0.019958113359039506, + "loss": 3.2708, + "mean_token_accuracy": 0.3865453898906708, + "num_tokens": 566023076.0, + "step": 1107 + }, + { + "epoch": 0.2996214169821525, + "grad_norm": 2.40625, + "learning_rate": 0.01995796130102849, + "loss": 3.2397, + "mean_token_accuracy": 0.39339351654052734, + "num_tokens": 566547347.0, + "step": 1108 + }, + { + "epoch": 0.29989183342347214, + "grad_norm": 2.734375, + "learning_rate": 0.019957808968160882, + "loss": 3.4384, + "mean_token_accuracy": 0.37583494186401367, + "num_tokens": 567071562.0, + "step": 1109 + }, + { + "epoch": 0.3001622498647918, + "grad_norm": 2.6875, + "learning_rate": 0.01995765636044135, + "loss": 3.4281, + "mean_token_accuracy": 0.3561092019081116, + "num_tokens": 567595741.0, + "step": 1110 + }, + { + "epoch": 0.30043266630611143, + "grad_norm": 1.609375, + "learning_rate": 0.019957503477874578, + "loss": 11.091, + "mean_token_accuracy": 2.9879574867663905e-05, + "num_tokens": 568119989.0, + "step": 1111 + }, + { + "epoch": 0.300703082747431, + "grad_norm": 9.4375, + "learning_rate": 0.01995735032046526, + "loss": 4.4901, + "mean_token_accuracy": 0.24019919335842133, + "num_tokens": 568644109.0, + "step": 1112 + }, + { + "epoch": 0.30097349918875066, + "grad_norm": 3.234375, + "learning_rate": 0.01995719688821809, + "loss": 3.6947, + "mean_token_accuracy": 0.3304447531700134, + "num_tokens": 569168361.0, + "step": 1113 + }, + { + "epoch": 0.3012439156300703, + "grad_norm": 2.671875, + "learning_rate": 0.019957043181137776, + "loss": 3.4792, + "mean_token_accuracy": 0.358177125453949, + "num_tokens": 569692598.0, + "step": 1114 + }, + { + "epoch": 0.30151433207138995, + "grad_norm": 3.609375, + "learning_rate": 0.01995688919922904, + "loss": 3.7605, + "mean_token_accuracy": 0.3230528235435486, + "num_tokens": 570216718.0, + "step": 1115 + }, + { + "epoch": 0.3017847485127096, + "grad_norm": 3.25, + "learning_rate": 0.019956734942496602, + "loss": 3.2873, + "mean_token_accuracy": 0.35426896810531616, + "num_tokens": 570718535.0, + "step": 1116 + }, + { + "epoch": 0.30205516495402923, + "grad_norm": 3.0, + "learning_rate": 0.019956580410945193, + "loss": 3.5562, + "mean_token_accuracy": 0.3612145781517029, + "num_tokens": 571242748.0, + "step": 1117 + }, + { + "epoch": 0.3023255813953488, + "grad_norm": 3.296875, + "learning_rate": 0.01995642560457956, + "loss": 3.6316, + "mean_token_accuracy": 0.3469327390193939, + "num_tokens": 571766896.0, + "step": 1118 + }, + { + "epoch": 0.30259599783666846, + "grad_norm": 3.0, + "learning_rate": 0.01995627052340445, + "loss": 3.4074, + "mean_token_accuracy": 0.3800327181816101, + "num_tokens": 572288905.0, + "step": 1119 + }, + { + "epoch": 0.3028664142779881, + "grad_norm": 4.3125, + "learning_rate": 0.019956115167424623, + "loss": 3.6701, + "mean_token_accuracy": 0.33112967014312744, + "num_tokens": 572813066.0, + "step": 1120 + }, + { + "epoch": 0.30313683071930775, + "grad_norm": 2.65625, + "learning_rate": 0.019955959536644842, + "loss": 3.4001, + "mean_token_accuracy": 0.37888842821121216, + "num_tokens": 573302793.0, + "step": 1121 + }, + { + "epoch": 0.3034072471606274, + "grad_norm": 3.15625, + "learning_rate": 0.019955803631069885, + "loss": 3.7251, + "mean_token_accuracy": 0.31316372752189636, + "num_tokens": 573826962.0, + "step": 1122 + }, + { + "epoch": 0.303677663601947, + "grad_norm": 2.515625, + "learning_rate": 0.019955647450704536, + "loss": 3.4489, + "mean_token_accuracy": 0.38240575790405273, + "num_tokens": 574338996.0, + "step": 1123 + }, + { + "epoch": 0.3039480800432666, + "grad_norm": 4.03125, + "learning_rate": 0.01995549099555359, + "loss": 3.3967, + "mean_token_accuracy": 0.36553701758384705, + "num_tokens": 574863252.0, + "step": 1124 + }, + { + "epoch": 0.30421849648458626, + "grad_norm": 4.59375, + "learning_rate": 0.019955334265621835, + "loss": 3.7056, + "mean_token_accuracy": 0.3481721878051758, + "num_tokens": 575387514.0, + "step": 1125 + }, + { + "epoch": 0.3044889129259059, + "grad_norm": 2.859375, + "learning_rate": 0.019955177260914097, + "loss": 3.288, + "mean_token_accuracy": 0.3996143937110901, + "num_tokens": 575851180.0, + "step": 1126 + }, + { + "epoch": 0.30475932936722555, + "grad_norm": 2.96875, + "learning_rate": 0.01995501998143518, + "loss": 3.2581, + "mean_token_accuracy": 0.3817344903945923, + "num_tokens": 576375183.0, + "step": 1127 + }, + { + "epoch": 0.30502974580854514, + "grad_norm": 2.84375, + "learning_rate": 0.01995486242718992, + "loss": 3.4327, + "mean_token_accuracy": 0.3650586009025574, + "num_tokens": 576899372.0, + "step": 1128 + }, + { + "epoch": 0.3053001622498648, + "grad_norm": 2.984375, + "learning_rate": 0.019954704598183142, + "loss": 3.2047, + "mean_token_accuracy": 0.396794855594635, + "num_tokens": 577423611.0, + "step": 1129 + }, + { + "epoch": 0.3055705786911844, + "grad_norm": 3.390625, + "learning_rate": 0.019954546494419693, + "loss": 3.2776, + "mean_token_accuracy": 0.38184481859207153, + "num_tokens": 577859345.0, + "step": 1130 + }, + { + "epoch": 0.30584099513250407, + "grad_norm": 232.0, + "learning_rate": 0.019954388115904425, + "loss": 19.9696, + "mean_token_accuracy": 0.0004631740157492459, + "num_tokens": 578383550.0, + "step": 1131 + }, + { + "epoch": 0.3061114115738237, + "grad_norm": 7.5625, + "learning_rate": 0.019954229462642196, + "loss": 3.8307, + "mean_token_accuracy": 0.33795005083084106, + "num_tokens": 578907835.0, + "step": 1132 + }, + { + "epoch": 0.3063818280151433, + "grad_norm": 2.453125, + "learning_rate": 0.019954070534637876, + "loss": 3.5753, + "mean_token_accuracy": 0.34726160764694214, + "num_tokens": 579432104.0, + "step": 1133 + }, + { + "epoch": 0.30665224445646294, + "grad_norm": 3.5, + "learning_rate": 0.019953911331896338, + "loss": 3.5402, + "mean_token_accuracy": 0.3686654567718506, + "num_tokens": 579956350.0, + "step": 1134 + }, + { + "epoch": 0.3069226608977826, + "grad_norm": 3.015625, + "learning_rate": 0.019953751854422472, + "loss": 3.4488, + "mean_token_accuracy": 0.35424235463142395, + "num_tokens": 580480581.0, + "step": 1135 + }, + { + "epoch": 0.3071930773391022, + "grad_norm": 3.390625, + "learning_rate": 0.019953592102221165, + "loss": 3.7075, + "mean_token_accuracy": 0.3320730924606323, + "num_tokens": 581004708.0, + "step": 1136 + }, + { + "epoch": 0.30746349378042187, + "grad_norm": 2.8125, + "learning_rate": 0.01995343207529732, + "loss": 3.4506, + "mean_token_accuracy": 0.3769141733646393, + "num_tokens": 581528796.0, + "step": 1137 + }, + { + "epoch": 0.30773391022174146, + "grad_norm": 3.390625, + "learning_rate": 0.019953271773655853, + "loss": 3.2736, + "mean_token_accuracy": 0.42435604333877563, + "num_tokens": 582052983.0, + "step": 1138 + }, + { + "epoch": 0.3080043266630611, + "grad_norm": 3.78125, + "learning_rate": 0.019953111197301674, + "loss": 3.4649, + "mean_token_accuracy": 0.3515176773071289, + "num_tokens": 582577257.0, + "step": 1139 + }, + { + "epoch": 0.30827474310438074, + "grad_norm": 2.96875, + "learning_rate": 0.019952950346239718, + "loss": 3.4184, + "mean_token_accuracy": 0.37261009216308594, + "num_tokens": 583084560.0, + "step": 1140 + }, + { + "epoch": 0.3085451595457004, + "grad_norm": 4.28125, + "learning_rate": 0.01995278922047491, + "loss": 3.4905, + "mean_token_accuracy": 0.33351367712020874, + "num_tokens": 583608815.0, + "step": 1141 + }, + { + "epoch": 0.30881557598702003, + "grad_norm": 2.796875, + "learning_rate": 0.019952627820012208, + "loss": 3.3808, + "mean_token_accuracy": 0.3681255578994751, + "num_tokens": 584133028.0, + "step": 1142 + }, + { + "epoch": 0.3090859924283396, + "grad_norm": 2.90625, + "learning_rate": 0.019952466144856552, + "loss": 3.3004, + "mean_token_accuracy": 0.36051440238952637, + "num_tokens": 584657194.0, + "step": 1143 + }, + { + "epoch": 0.30935640886965926, + "grad_norm": 2.671875, + "learning_rate": 0.01995230419501291, + "loss": 3.3848, + "mean_token_accuracy": 0.38719671964645386, + "num_tokens": 585181389.0, + "step": 1144 + }, + { + "epoch": 0.3096268253109789, + "grad_norm": 3.5, + "learning_rate": 0.019952141970486243, + "loss": 3.537, + "mean_token_accuracy": 0.36735063791275024, + "num_tokens": 585705660.0, + "step": 1145 + }, + { + "epoch": 0.30989724175229855, + "grad_norm": 2.859375, + "learning_rate": 0.01995197947128154, + "loss": 3.3812, + "mean_token_accuracy": 0.3797033131122589, + "num_tokens": 586229864.0, + "step": 1146 + }, + { + "epoch": 0.3101676581936182, + "grad_norm": 3.09375, + "learning_rate": 0.019951816697403774, + "loss": 3.2082, + "mean_token_accuracy": 0.3676885962486267, + "num_tokens": 586754005.0, + "step": 1147 + }, + { + "epoch": 0.3104380746349378, + "grad_norm": 2.796875, + "learning_rate": 0.01995165364885795, + "loss": 3.3885, + "mean_token_accuracy": 0.3720381259918213, + "num_tokens": 587257658.0, + "step": 1148 + }, + { + "epoch": 0.3107084910762574, + "grad_norm": 2.484375, + "learning_rate": 0.019951490325649072, + "loss": 3.1324, + "mean_token_accuracy": 0.3841356635093689, + "num_tokens": 587781903.0, + "step": 1149 + }, + { + "epoch": 0.31097890751757706, + "grad_norm": 2.828125, + "learning_rate": 0.01995132672778214, + "loss": 3.1968, + "mean_token_accuracy": 0.3961133360862732, + "num_tokens": 588288828.0, + "step": 1150 + }, + { + "epoch": 0.3112493239588967, + "grad_norm": 68.5, + "learning_rate": 0.01995116285526218, + "loss": 11.0543, + "mean_token_accuracy": 0.005005113780498505, + "num_tokens": 588761381.0, + "step": 1151 + }, + { + "epoch": 0.31151974040021635, + "grad_norm": 7.03125, + "learning_rate": 0.01995099870809422, + "loss": 3.8773, + "mean_token_accuracy": 0.3134500980377197, + "num_tokens": 589270065.0, + "step": 1152 + }, + { + "epoch": 0.311790156841536, + "grad_norm": 2.046875, + "learning_rate": 0.0199508342862833, + "loss": 3.4264, + "mean_token_accuracy": 0.3570270240306854, + "num_tokens": 589794327.0, + "step": 1153 + }, + { + "epoch": 0.3120605732828556, + "grad_norm": 2.875, + "learning_rate": 0.019950669589834456, + "loss": 3.6551, + "mean_token_accuracy": 0.3739541172981262, + "num_tokens": 590257826.0, + "step": 1154 + }, + { + "epoch": 0.3123309897241752, + "grad_norm": 3.46875, + "learning_rate": 0.019950504618752748, + "loss": 3.5939, + "mean_token_accuracy": 0.34960415959358215, + "num_tokens": 590781895.0, + "step": 1155 + }, + { + "epoch": 0.31260140616549487, + "grad_norm": 3.265625, + "learning_rate": 0.01995033937304324, + "loss": 3.2897, + "mean_token_accuracy": 0.370268851518631, + "num_tokens": 591306052.0, + "step": 1156 + }, + { + "epoch": 0.3128718226068145, + "grad_norm": 3.421875, + "learning_rate": 0.019950173852710997, + "loss": 3.4661, + "mean_token_accuracy": 0.379529744386673, + "num_tokens": 591796435.0, + "step": 1157 + }, + { + "epoch": 0.31314223904813415, + "grad_norm": 2.859375, + "learning_rate": 0.0199500080577611, + "loss": 3.4335, + "mean_token_accuracy": 0.34843170642852783, + "num_tokens": 592320698.0, + "step": 1158 + }, + { + "epoch": 0.31341265548945374, + "grad_norm": 2.84375, + "learning_rate": 0.019949841988198637, + "loss": 3.5651, + "mean_token_accuracy": 0.36868512630462646, + "num_tokens": 592844958.0, + "step": 1159 + }, + { + "epoch": 0.3136830719307734, + "grad_norm": 2.78125, + "learning_rate": 0.0199496756440287, + "loss": 3.0875, + "mean_token_accuracy": 0.36901581287384033, + "num_tokens": 593369182.0, + "step": 1160 + }, + { + "epoch": 0.313953488372093, + "grad_norm": 2.34375, + "learning_rate": 0.019949509025256395, + "loss": 3.4488, + "mean_token_accuracy": 0.36053547263145447, + "num_tokens": 593893338.0, + "step": 1161 + }, + { + "epoch": 0.31422390481341267, + "grad_norm": 2.828125, + "learning_rate": 0.019949342131886833, + "loss": 3.2114, + "mean_token_accuracy": 0.38287675380706787, + "num_tokens": 594403184.0, + "step": 1162 + }, + { + "epoch": 0.3144943212547323, + "grad_norm": 2.625, + "learning_rate": 0.019949174963925138, + "loss": 3.5641, + "mean_token_accuracy": 0.35272741317749023, + "num_tokens": 594927332.0, + "step": 1163 + }, + { + "epoch": 0.3147647376960519, + "grad_norm": 2.671875, + "learning_rate": 0.019949007521376436, + "loss": 3.1362, + "mean_token_accuracy": 0.3666996657848358, + "num_tokens": 595451423.0, + "step": 1164 + }, + { + "epoch": 0.31503515413737154, + "grad_norm": 2.734375, + "learning_rate": 0.019948839804245867, + "loss": 3.2978, + "mean_token_accuracy": 0.3950335383415222, + "num_tokens": 595924911.0, + "step": 1165 + }, + { + "epoch": 0.3153055705786912, + "grad_norm": 3.265625, + "learning_rate": 0.019948671812538576, + "loss": 3.4861, + "mean_token_accuracy": 0.34104639291763306, + "num_tokens": 596449032.0, + "step": 1166 + }, + { + "epoch": 0.31557598702001083, + "grad_norm": 2.40625, + "learning_rate": 0.019948503546259717, + "loss": 3.2343, + "mean_token_accuracy": 0.38205552101135254, + "num_tokens": 596965109.0, + "step": 1167 + }, + { + "epoch": 0.31584640346133047, + "grad_norm": 2.734375, + "learning_rate": 0.019948335005414455, + "loss": 3.1411, + "mean_token_accuracy": 0.3836778700351715, + "num_tokens": 597454273.0, + "step": 1168 + }, + { + "epoch": 0.31611681990265006, + "grad_norm": 2.6875, + "learning_rate": 0.019948166190007954, + "loss": 3.4805, + "mean_token_accuracy": 0.37148958444595337, + "num_tokens": 597945557.0, + "step": 1169 + }, + { + "epoch": 0.3163872363439697, + "grad_norm": 3.828125, + "learning_rate": 0.019947997100045405, + "loss": 3.6818, + "mean_token_accuracy": 0.3468678593635559, + "num_tokens": 598469830.0, + "step": 1170 + }, + { + "epoch": 0.31665765278528935, + "grad_norm": 418.0, + "learning_rate": 0.01994782773553199, + "loss": 17.2661, + "mean_token_accuracy": 0.006544564850628376, + "num_tokens": 598994106.0, + "step": 1171 + }, + { + "epoch": 0.316928069226609, + "grad_norm": 8.75, + "learning_rate": 0.019947658096472906, + "loss": 4.4096, + "mean_token_accuracy": 0.28552788496017456, + "num_tokens": 599518346.0, + "step": 1172 + }, + { + "epoch": 0.31719848566792863, + "grad_norm": 3.890625, + "learning_rate": 0.019947488182873358, + "loss": 3.7721, + "mean_token_accuracy": 0.34598487615585327, + "num_tokens": 600015425.0, + "step": 1173 + }, + { + "epoch": 0.3174689021092482, + "grad_norm": 5.15625, + "learning_rate": 0.019947317994738558, + "loss": 3.745, + "mean_token_accuracy": 0.32188189029693604, + "num_tokens": 600539577.0, + "step": 1174 + }, + { + "epoch": 0.31773931855056786, + "grad_norm": 3.40625, + "learning_rate": 0.01994714753207373, + "loss": 3.536, + "mean_token_accuracy": 0.3585498332977295, + "num_tokens": 601063794.0, + "step": 1175 + }, + { + "epoch": 0.3180097349918875, + "grad_norm": 3.921875, + "learning_rate": 0.019946976794884105, + "loss": 3.486, + "mean_token_accuracy": 0.37774211168289185, + "num_tokens": 601536111.0, + "step": 1176 + }, + { + "epoch": 0.31828015143320715, + "grad_norm": 3.59375, + "learning_rate": 0.019946805783174922, + "loss": 3.7154, + "mean_token_accuracy": 0.34262028336524963, + "num_tokens": 602060357.0, + "step": 1177 + }, + { + "epoch": 0.3185505678745268, + "grad_norm": 3.546875, + "learning_rate": 0.019946634496951426, + "loss": 3.2827, + "mean_token_accuracy": 0.4030107259750366, + "num_tokens": 602584608.0, + "step": 1178 + }, + { + "epoch": 0.3188209843158464, + "grad_norm": 2.1875, + "learning_rate": 0.01994646293621887, + "loss": 3.4012, + "mean_token_accuracy": 0.38689303398132324, + "num_tokens": 603059420.0, + "step": 1179 + }, + { + "epoch": 0.319091400757166, + "grad_norm": 4.90625, + "learning_rate": 0.019946291100982526, + "loss": 3.5062, + "mean_token_accuracy": 0.3388240337371826, + "num_tokens": 603583699.0, + "step": 1180 + }, + { + "epoch": 0.31936181719848566, + "grad_norm": 3.453125, + "learning_rate": 0.019946118991247656, + "loss": 3.3756, + "mean_token_accuracy": 0.3801005780696869, + "num_tokens": 604065182.0, + "step": 1181 + }, + { + "epoch": 0.3196322336398053, + "grad_norm": 3.890625, + "learning_rate": 0.019945946607019555, + "loss": 3.4993, + "mean_token_accuracy": 0.3638448715209961, + "num_tokens": 604530011.0, + "step": 1182 + }, + { + "epoch": 0.31990265008112495, + "grad_norm": 3.1875, + "learning_rate": 0.019945773948303495, + "loss": 3.3712, + "mean_token_accuracy": 0.37941575050354004, + "num_tokens": 605054208.0, + "step": 1183 + }, + { + "epoch": 0.32017306652244454, + "grad_norm": 3.234375, + "learning_rate": 0.01994560101510479, + "loss": 3.4267, + "mean_token_accuracy": 0.3338669538497925, + "num_tokens": 605578355.0, + "step": 1184 + }, + { + "epoch": 0.3204434829637642, + "grad_norm": 2.828125, + "learning_rate": 0.01994542780742873, + "loss": 3.4613, + "mean_token_accuracy": 0.3791651129722595, + "num_tokens": 606053571.0, + "step": 1185 + }, + { + "epoch": 0.3207138994050838, + "grad_norm": 3.015625, + "learning_rate": 0.019945254325280645, + "loss": 3.4639, + "mean_token_accuracy": 0.37067848443984985, + "num_tokens": 606518698.0, + "step": 1186 + }, + { + "epoch": 0.32098431584640347, + "grad_norm": 3.65625, + "learning_rate": 0.019945080568665847, + "loss": 3.3641, + "mean_token_accuracy": 0.371471107006073, + "num_tokens": 607042899.0, + "step": 1187 + }, + { + "epoch": 0.3212547322877231, + "grad_norm": 4.25, + "learning_rate": 0.019944906537589676, + "loss": 3.5273, + "mean_token_accuracy": 0.35760605335235596, + "num_tokens": 607567074.0, + "step": 1188 + }, + { + "epoch": 0.32152514872904275, + "grad_norm": 3.21875, + "learning_rate": 0.019944732232057467, + "loss": 3.6305, + "mean_token_accuracy": 0.3535503149032593, + "num_tokens": 608091345.0, + "step": 1189 + }, + { + "epoch": 0.32179556517036234, + "grad_norm": 3.359375, + "learning_rate": 0.019944557652074563, + "loss": 3.4189, + "mean_token_accuracy": 0.3573848009109497, + "num_tokens": 608615613.0, + "step": 1190 + }, + { + "epoch": 0.322065981611682, + "grad_norm": 28.875, + "learning_rate": 0.019944382797646328, + "loss": 10.2804, + "mean_token_accuracy": 0.028046388179063797, + "num_tokens": 609118958.0, + "step": 1191 + }, + { + "epoch": 0.3223363980530016, + "grad_norm": 5.46875, + "learning_rate": 0.01994420766877813, + "loss": 4.0219, + "mean_token_accuracy": 0.3489723205566406, + "num_tokens": 609544233.0, + "step": 1192 + }, + { + "epoch": 0.32260681449432127, + "grad_norm": 2.421875, + "learning_rate": 0.01994403226547533, + "loss": 3.6365, + "mean_token_accuracy": 0.3596615791320801, + "num_tokens": 610035559.0, + "step": 1193 + }, + { + "epoch": 0.3228772309356409, + "grad_norm": 3.453125, + "learning_rate": 0.01994385658774332, + "loss": 3.5471, + "mean_token_accuracy": 0.3426942229270935, + "num_tokens": 610559840.0, + "step": 1194 + }, + { + "epoch": 0.3231476473769605, + "grad_norm": 2.78125, + "learning_rate": 0.019943680635587487, + "loss": 3.1828, + "mean_token_accuracy": 0.3768848478794098, + "num_tokens": 611051574.0, + "step": 1195 + }, + { + "epoch": 0.32341806381828014, + "grad_norm": 2.671875, + "learning_rate": 0.019943504409013234, + "loss": 3.4963, + "mean_token_accuracy": 0.3433520197868347, + "num_tokens": 611575778.0, + "step": 1196 + }, + { + "epoch": 0.3236884802595998, + "grad_norm": 4.0, + "learning_rate": 0.019943327908025964, + "loss": 3.6497, + "mean_token_accuracy": 0.35413897037506104, + "num_tokens": 612100057.0, + "step": 1197 + }, + { + "epoch": 0.32395889670091943, + "grad_norm": 3.546875, + "learning_rate": 0.01994315113263109, + "loss": 3.6117, + "mean_token_accuracy": 0.3390932083129883, + "num_tokens": 612624323.0, + "step": 1198 + }, + { + "epoch": 0.3242293131422391, + "grad_norm": 3.203125, + "learning_rate": 0.019942974082834044, + "loss": 3.5849, + "mean_token_accuracy": 0.37006625533103943, + "num_tokens": 613148525.0, + "step": 1199 + }, + { + "epoch": 0.32449972958355866, + "grad_norm": 4.0, + "learning_rate": 0.019942796758640247, + "loss": 3.4407, + "mean_token_accuracy": 0.34982800483703613, + "num_tokens": 613672728.0, + "step": 1200 + }, + { + "epoch": 0.3247701460248783, + "grad_norm": 2.484375, + "learning_rate": 0.01994261916005515, + "loss": 3.3626, + "mean_token_accuracy": 0.37280911207199097, + "num_tokens": 614195575.0, + "step": 1201 + }, + { + "epoch": 0.32504056246619795, + "grad_norm": 3.234375, + "learning_rate": 0.0199424412870842, + "loss": 3.6059, + "mean_token_accuracy": 0.3595869541168213, + "num_tokens": 614719622.0, + "step": 1202 + }, + { + "epoch": 0.3253109789075176, + "grad_norm": 2.875, + "learning_rate": 0.019942263139732852, + "loss": 3.4928, + "mean_token_accuracy": 0.35717469453811646, + "num_tokens": 615243904.0, + "step": 1203 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 3.359375, + "learning_rate": 0.01994208471800657, + "loss": 3.6031, + "mean_token_accuracy": 0.36702960729599, + "num_tokens": 615768119.0, + "step": 1204 + }, + { + "epoch": 0.3258518117901568, + "grad_norm": 3.5, + "learning_rate": 0.01994190602191084, + "loss": 3.3551, + "mean_token_accuracy": 0.3625718355178833, + "num_tokens": 616292219.0, + "step": 1205 + }, + { + "epoch": 0.32612222823147646, + "grad_norm": 2.546875, + "learning_rate": 0.019941727051451127, + "loss": 3.4396, + "mean_token_accuracy": 0.3677324652671814, + "num_tokens": 616816489.0, + "step": 1206 + }, + { + "epoch": 0.3263926446727961, + "grad_norm": 7.5625, + "learning_rate": 0.019941547806632938, + "loss": 3.0864, + "mean_token_accuracy": 0.3918267488479614, + "num_tokens": 617340602.0, + "step": 1207 + }, + { + "epoch": 0.32666306111411575, + "grad_norm": 2.015625, + "learning_rate": 0.019941368287461764, + "loss": 3.2799, + "mean_token_accuracy": 0.38073503971099854, + "num_tokens": 617864862.0, + "step": 1208 + }, + { + "epoch": 0.3269334775554354, + "grad_norm": 2.71875, + "learning_rate": 0.019941188493943118, + "loss": 3.4317, + "mean_token_accuracy": 0.36097967624664307, + "num_tokens": 618388915.0, + "step": 1209 + }, + { + "epoch": 0.327203893996755, + "grad_norm": 4.21875, + "learning_rate": 0.019941008426082516, + "loss": 3.4801, + "mean_token_accuracy": 0.35802745819091797, + "num_tokens": 618913043.0, + "step": 1210 + }, + { + "epoch": 0.3274743104380746, + "grad_norm": 28.0, + "learning_rate": 0.01994082808388548, + "loss": 11.4914, + "mean_token_accuracy": 0.0094792814925313, + "num_tokens": 619437294.0, + "step": 1211 + }, + { + "epoch": 0.32774472687939427, + "grad_norm": 5.1875, + "learning_rate": 0.019940647467357547, + "loss": 3.9114, + "mean_token_accuracy": 0.3138602674007416, + "num_tokens": 619961517.0, + "step": 1212 + }, + { + "epoch": 0.3280151433207139, + "grad_norm": 2.375, + "learning_rate": 0.019940466576504254, + "loss": 3.3976, + "mean_token_accuracy": 0.33439651131629944, + "num_tokens": 620485760.0, + "step": 1213 + }, + { + "epoch": 0.32828555976203355, + "grad_norm": 3.3125, + "learning_rate": 0.019940285411331155, + "loss": 3.7476, + "mean_token_accuracy": 0.33958956599235535, + "num_tokens": 621009949.0, + "step": 1214 + }, + { + "epoch": 0.32855597620335314, + "grad_norm": 3.328125, + "learning_rate": 0.01994010397184381, + "loss": 3.6353, + "mean_token_accuracy": 0.3495219647884369, + "num_tokens": 621497362.0, + "step": 1215 + }, + { + "epoch": 0.3288263926446728, + "grad_norm": 2.921875, + "learning_rate": 0.019939922258047783, + "loss": 3.3698, + "mean_token_accuracy": 0.35825344920158386, + "num_tokens": 621971637.0, + "step": 1216 + }, + { + "epoch": 0.3290968090859924, + "grad_norm": 3.65625, + "learning_rate": 0.01993974026994865, + "loss": 3.4215, + "mean_token_accuracy": 0.34675461053848267, + "num_tokens": 622472995.0, + "step": 1217 + }, + { + "epoch": 0.32936722552731207, + "grad_norm": 2.796875, + "learning_rate": 0.019939558007551997, + "loss": 3.5262, + "mean_token_accuracy": 0.3625323474407196, + "num_tokens": 622997154.0, + "step": 1218 + }, + { + "epoch": 0.3296376419686317, + "grad_norm": 3.90625, + "learning_rate": 0.019939375470863413, + "loss": 3.6018, + "mean_token_accuracy": 0.348671555519104, + "num_tokens": 623521356.0, + "step": 1219 + }, + { + "epoch": 0.3299080584099513, + "grad_norm": 2.703125, + "learning_rate": 0.0199391926598885, + "loss": 3.3184, + "mean_token_accuracy": 0.38815540075302124, + "num_tokens": 624015242.0, + "step": 1220 + }, + { + "epoch": 0.33017847485127094, + "grad_norm": 2.734375, + "learning_rate": 0.019939009574632873, + "loss": 3.2872, + "mean_token_accuracy": 0.36901211738586426, + "num_tokens": 624539433.0, + "step": 1221 + }, + { + "epoch": 0.3304488912925906, + "grad_norm": 2.515625, + "learning_rate": 0.019938826215102137, + "loss": 3.4759, + "mean_token_accuracy": 0.35865092277526855, + "num_tokens": 625053763.0, + "step": 1222 + }, + { + "epoch": 0.33071930773391023, + "grad_norm": 3.078125, + "learning_rate": 0.01993864258130193, + "loss": 3.4712, + "mean_token_accuracy": 0.35761141777038574, + "num_tokens": 625577849.0, + "step": 1223 + }, + { + "epoch": 0.33098972417522987, + "grad_norm": 3.34375, + "learning_rate": 0.019938458673237878, + "loss": 3.532, + "mean_token_accuracy": 0.3680582046508789, + "num_tokens": 626101970.0, + "step": 1224 + }, + { + "epoch": 0.33126014061654946, + "grad_norm": 3.109375, + "learning_rate": 0.01993827449091563, + "loss": 3.2974, + "mean_token_accuracy": 0.3894311189651489, + "num_tokens": 626626102.0, + "step": 1225 + }, + { + "epoch": 0.3315305570578691, + "grad_norm": 3.0625, + "learning_rate": 0.019938090034340834, + "loss": 3.4438, + "mean_token_accuracy": 0.34154796600341797, + "num_tokens": 627150226.0, + "step": 1226 + }, + { + "epoch": 0.33180097349918874, + "grad_norm": 3.5, + "learning_rate": 0.019937905303519153, + "loss": 3.523, + "mean_token_accuracy": 0.34528928995132446, + "num_tokens": 627633413.0, + "step": 1227 + }, + { + "epoch": 0.3320713899405084, + "grad_norm": 3.421875, + "learning_rate": 0.01993772029845625, + "loss": 3.1867, + "mean_token_accuracy": 0.39126336574554443, + "num_tokens": 628106399.0, + "step": 1228 + }, + { + "epoch": 0.33234180638182803, + "grad_norm": 2.890625, + "learning_rate": 0.019937535019157803, + "loss": 3.4446, + "mean_token_accuracy": 0.36885327100753784, + "num_tokens": 628630669.0, + "step": 1229 + }, + { + "epoch": 0.3326122228231477, + "grad_norm": 3.515625, + "learning_rate": 0.0199373494656295, + "loss": 3.5108, + "mean_token_accuracy": 0.3697383999824524, + "num_tokens": 629113761.0, + "step": 1230 + }, + { + "epoch": 0.33288263926446726, + "grad_norm": 7.84375, + "learning_rate": 0.019937163637877033, + "loss": 10.6775, + "mean_token_accuracy": 8.450591121800244e-05, + "num_tokens": 629637973.0, + "step": 1231 + }, + { + "epoch": 0.3331530557057869, + "grad_norm": 6.375, + "learning_rate": 0.019936977535906102, + "loss": 3.8881, + "mean_token_accuracy": 0.3310481309890747, + "num_tokens": 630162240.0, + "step": 1232 + }, + { + "epoch": 0.33342347214710655, + "grad_norm": 2.546875, + "learning_rate": 0.01993679115972242, + "loss": 3.5237, + "mean_token_accuracy": 0.3491247296333313, + "num_tokens": 630686343.0, + "step": 1233 + }, + { + "epoch": 0.3336938885884262, + "grad_norm": 3.546875, + "learning_rate": 0.019936604509331702, + "loss": 3.4099, + "mean_token_accuracy": 0.36148759722709656, + "num_tokens": 631210580.0, + "step": 1234 + }, + { + "epoch": 0.33396430502974583, + "grad_norm": 68.5, + "learning_rate": 0.019936417584739678, + "loss": 4.3257, + "mean_token_accuracy": 0.34771353006362915, + "num_tokens": 631663426.0, + "step": 1235 + }, + { + "epoch": 0.3342347214710654, + "grad_norm": 4.875, + "learning_rate": 0.019936230385952082, + "loss": 3.8049, + "mean_token_accuracy": 0.3039402961730957, + "num_tokens": 632187664.0, + "step": 1236 + }, + { + "epoch": 0.33450513791238506, + "grad_norm": 2.09375, + "learning_rate": 0.01993604291297466, + "loss": 3.6028, + "mean_token_accuracy": 0.34608402848243713, + "num_tokens": 632666638.0, + "step": 1237 + }, + { + "epoch": 0.3347755543537047, + "grad_norm": 3.84375, + "learning_rate": 0.019935855165813157, + "loss": 3.5094, + "mean_token_accuracy": 0.36278480291366577, + "num_tokens": 633190866.0, + "step": 1238 + }, + { + "epoch": 0.33504597079502435, + "grad_norm": 4.28125, + "learning_rate": 0.019935667144473344, + "loss": 3.8821, + "mean_token_accuracy": 0.3295637369155884, + "num_tokens": 633715111.0, + "step": 1239 + }, + { + "epoch": 0.335316387236344, + "grad_norm": 3.328125, + "learning_rate": 0.01993547884896098, + "loss": 3.6233, + "mean_token_accuracy": 0.33602917194366455, + "num_tokens": 634239251.0, + "step": 1240 + }, + { + "epoch": 0.3355868036776636, + "grad_norm": 3.296875, + "learning_rate": 0.01993529027928185, + "loss": 3.7993, + "mean_token_accuracy": 0.3641049861907959, + "num_tokens": 634679609.0, + "step": 1241 + }, + { + "epoch": 0.3358572201189832, + "grad_norm": 2.484375, + "learning_rate": 0.01993510143544174, + "loss": 3.4676, + "mean_token_accuracy": 0.37576180696487427, + "num_tokens": 635186429.0, + "step": 1242 + }, + { + "epoch": 0.33612763656030287, + "grad_norm": 2.796875, + "learning_rate": 0.019934912317446436, + "loss": 3.3133, + "mean_token_accuracy": 0.3826521933078766, + "num_tokens": 635664792.0, + "step": 1243 + }, + { + "epoch": 0.3363980530016225, + "grad_norm": 2.65625, + "learning_rate": 0.01993472292530175, + "loss": 3.3278, + "mean_token_accuracy": 0.37353530526161194, + "num_tokens": 636188978.0, + "step": 1244 + }, + { + "epoch": 0.33666846944294215, + "grad_norm": 2.78125, + "learning_rate": 0.01993453325901349, + "loss": 3.5963, + "mean_token_accuracy": 0.36782827973365784, + "num_tokens": 636713058.0, + "step": 1245 + }, + { + "epoch": 0.33693888588426174, + "grad_norm": 2.828125, + "learning_rate": 0.019934343318587473, + "loss": 3.2821, + "mean_token_accuracy": 0.3953056335449219, + "num_tokens": 637237242.0, + "step": 1246 + }, + { + "epoch": 0.3372093023255814, + "grad_norm": 3.796875, + "learning_rate": 0.019934153104029528, + "loss": 3.5617, + "mean_token_accuracy": 0.3764355778694153, + "num_tokens": 637733143.0, + "step": 1247 + }, + { + "epoch": 0.337479718766901, + "grad_norm": 3.421875, + "learning_rate": 0.019933962615345493, + "loss": 3.4001, + "mean_token_accuracy": 0.3451356291770935, + "num_tokens": 638257289.0, + "step": 1248 + }, + { + "epoch": 0.33775013520822067, + "grad_norm": 2.671875, + "learning_rate": 0.01993377185254121, + "loss": 3.4637, + "mean_token_accuracy": 0.36271554231643677, + "num_tokens": 638781531.0, + "step": 1249 + }, + { + "epoch": 0.3380205516495403, + "grad_norm": 3.015625, + "learning_rate": 0.019933580815622538, + "loss": 3.3051, + "mean_token_accuracy": 0.37625253200531006, + "num_tokens": 639305809.0, + "step": 1250 + }, + { + "epoch": 0.3382909680908599, + "grad_norm": 76.5, + "learning_rate": 0.01993338950459533, + "loss": 18.5551, + "mean_token_accuracy": 0.00012749270536005497, + "num_tokens": 639829997.0, + "step": 1251 + }, + { + "epoch": 0.33856138453217954, + "grad_norm": 5.78125, + "learning_rate": 0.019933197919465465, + "loss": 3.701, + "mean_token_accuracy": 0.3174591064453125, + "num_tokens": 640354266.0, + "step": 1252 + }, + { + "epoch": 0.3388318009734992, + "grad_norm": 1.984375, + "learning_rate": 0.019933006060238817, + "loss": 3.4978, + "mean_token_accuracy": 0.3646790683269501, + "num_tokens": 640878471.0, + "step": 1253 + }, + { + "epoch": 0.33910221741481883, + "grad_norm": 3.015625, + "learning_rate": 0.019932813926921273, + "loss": 3.614, + "mean_token_accuracy": 0.37083131074905396, + "num_tokens": 641340917.0, + "step": 1254 + }, + { + "epoch": 0.3393726338561385, + "grad_norm": 2.875, + "learning_rate": 0.019932621519518724, + "loss": 3.5289, + "mean_token_accuracy": 0.3691553473472595, + "num_tokens": 641865004.0, + "step": 1255 + }, + { + "epoch": 0.33964305029745806, + "grad_norm": 4.15625, + "learning_rate": 0.01993242883803708, + "loss": 3.5536, + "mean_token_accuracy": 0.34922319650650024, + "num_tokens": 642389246.0, + "step": 1256 + }, + { + "epoch": 0.3399134667387777, + "grad_norm": 4.375, + "learning_rate": 0.01993223588248225, + "loss": 3.4343, + "mean_token_accuracy": 0.389214426279068, + "num_tokens": 642913513.0, + "step": 1257 + }, + { + "epoch": 0.34018388318009735, + "grad_norm": 3.15625, + "learning_rate": 0.019932042652860155, + "loss": 3.3171, + "mean_token_accuracy": 0.3562287986278534, + "num_tokens": 643437561.0, + "step": 1258 + }, + { + "epoch": 0.340454299621417, + "grad_norm": 3.53125, + "learning_rate": 0.019931849149176725, + "loss": 3.3819, + "mean_token_accuracy": 0.36571985483169556, + "num_tokens": 643933593.0, + "step": 1259 + }, + { + "epoch": 0.34072471606273663, + "grad_norm": 3.3125, + "learning_rate": 0.019931655371437897, + "loss": 3.5021, + "mean_token_accuracy": 0.3643417954444885, + "num_tokens": 644434696.0, + "step": 1260 + }, + { + "epoch": 0.3409951325040562, + "grad_norm": 3.796875, + "learning_rate": 0.019931461319649617, + "loss": 3.5367, + "mean_token_accuracy": 0.3617716431617737, + "num_tokens": 644958937.0, + "step": 1261 + }, + { + "epoch": 0.34126554894537586, + "grad_norm": 3.046875, + "learning_rate": 0.019931266993817833, + "loss": 3.4315, + "mean_token_accuracy": 0.3836081326007843, + "num_tokens": 645483170.0, + "step": 1262 + }, + { + "epoch": 0.3415359653866955, + "grad_norm": 3.625, + "learning_rate": 0.019931072393948514, + "loss": 3.599, + "mean_token_accuracy": 0.3784027695655823, + "num_tokens": 646007347.0, + "step": 1263 + }, + { + "epoch": 0.34180638182801515, + "grad_norm": 3.6875, + "learning_rate": 0.019930877520047633, + "loss": 3.4859, + "mean_token_accuracy": 0.34861356019973755, + "num_tokens": 646531620.0, + "step": 1264 + }, + { + "epoch": 0.3420767982693348, + "grad_norm": 2.671875, + "learning_rate": 0.019930682372121162, + "loss": 3.3808, + "mean_token_accuracy": 0.375606894493103, + "num_tokens": 647055824.0, + "step": 1265 + }, + { + "epoch": 0.34234721471065443, + "grad_norm": 3.09375, + "learning_rate": 0.019930486950175092, + "loss": 3.4995, + "mean_token_accuracy": 0.3688426613807678, + "num_tokens": 647479743.0, + "step": 1266 + }, + { + "epoch": 0.342617631151974, + "grad_norm": 2.5625, + "learning_rate": 0.019930291254215422, + "loss": 3.4215, + "mean_token_accuracy": 0.369623064994812, + "num_tokens": 648004016.0, + "step": 1267 + }, + { + "epoch": 0.34288804759329367, + "grad_norm": 2.625, + "learning_rate": 0.019930095284248155, + "loss": 3.455, + "mean_token_accuracy": 0.355191171169281, + "num_tokens": 648528265.0, + "step": 1268 + }, + { + "epoch": 0.3431584640346133, + "grad_norm": 3.140625, + "learning_rate": 0.0199298990402793, + "loss": 3.4678, + "mean_token_accuracy": 0.36558419466018677, + "num_tokens": 649013393.0, + "step": 1269 + }, + { + "epoch": 0.34342888047593295, + "grad_norm": 3.421875, + "learning_rate": 0.019929702522314886, + "loss": 3.4994, + "mean_token_accuracy": 0.35924315452575684, + "num_tokens": 649537673.0, + "step": 1270 + }, + { + "epoch": 0.3436992969172526, + "grad_norm": 82.0, + "learning_rate": 0.01992950573036093, + "loss": 10.4361, + "mean_token_accuracy": 0.015707671642303467, + "num_tokens": 650058285.0, + "step": 1271 + }, + { + "epoch": 0.3439697133585722, + "grad_norm": 7.21875, + "learning_rate": 0.019929308664423485, + "loss": 3.6703, + "mean_token_accuracy": 0.33366650342941284, + "num_tokens": 650536320.0, + "step": 1272 + }, + { + "epoch": 0.3442401297998918, + "grad_norm": 2.65625, + "learning_rate": 0.01992911132450859, + "loss": 3.428, + "mean_token_accuracy": 0.3577038049697876, + "num_tokens": 651060538.0, + "step": 1273 + }, + { + "epoch": 0.34451054624121147, + "grad_norm": 2.671875, + "learning_rate": 0.0199289137106223, + "loss": 3.6171, + "mean_token_accuracy": 0.358428418636322, + "num_tokens": 651555477.0, + "step": 1274 + }, + { + "epoch": 0.3447809626825311, + "grad_norm": 2.640625, + "learning_rate": 0.019928715822770678, + "loss": 3.26, + "mean_token_accuracy": 0.40489599108695984, + "num_tokens": 652079655.0, + "step": 1275 + }, + { + "epoch": 0.34505137912385075, + "grad_norm": 2.28125, + "learning_rate": 0.019928517660959797, + "loss": 3.4045, + "mean_token_accuracy": 0.36628174781799316, + "num_tokens": 652588748.0, + "step": 1276 + }, + { + "epoch": 0.34532179556517034, + "grad_norm": 2.578125, + "learning_rate": 0.019928319225195737, + "loss": 3.3531, + "mean_token_accuracy": 0.35792192816734314, + "num_tokens": 653112923.0, + "step": 1277 + }, + { + "epoch": 0.34559221200649, + "grad_norm": 2.8125, + "learning_rate": 0.01992812051548459, + "loss": 3.2035, + "mean_token_accuracy": 0.38243359327316284, + "num_tokens": 653637182.0, + "step": 1278 + }, + { + "epoch": 0.34586262844780963, + "grad_norm": 3.109375, + "learning_rate": 0.019927921531832444, + "loss": 3.375, + "mean_token_accuracy": 0.3802313804626465, + "num_tokens": 654161292.0, + "step": 1279 + }, + { + "epoch": 0.34613304488912927, + "grad_norm": 4.03125, + "learning_rate": 0.019927722274245413, + "loss": 3.6512, + "mean_token_accuracy": 0.34221649169921875, + "num_tokens": 654685492.0, + "step": 1280 + }, + { + "epoch": 0.3464034613304489, + "grad_norm": 3.375, + "learning_rate": 0.01992752274272961, + "loss": 3.5652, + "mean_token_accuracy": 0.3522915840148926, + "num_tokens": 655209753.0, + "step": 1281 + }, + { + "epoch": 0.3466738777717685, + "grad_norm": 2.171875, + "learning_rate": 0.01992732293729115, + "loss": 3.187, + "mean_token_accuracy": 0.37561628222465515, + "num_tokens": 655733980.0, + "step": 1282 + }, + { + "epoch": 0.34694429421308814, + "grad_norm": 2.046875, + "learning_rate": 0.019927122857936175, + "loss": 3.3641, + "mean_token_accuracy": 0.3826271891593933, + "num_tokens": 656244261.0, + "step": 1283 + }, + { + "epoch": 0.3472147106544078, + "grad_norm": 3.171875, + "learning_rate": 0.019926922504670815, + "loss": 3.5989, + "mean_token_accuracy": 0.3341633677482605, + "num_tokens": 656768467.0, + "step": 1284 + }, + { + "epoch": 0.34748512709572743, + "grad_norm": 2.4375, + "learning_rate": 0.01992672187750122, + "loss": 3.3022, + "mean_token_accuracy": 0.3738354444503784, + "num_tokens": 657242261.0, + "step": 1285 + }, + { + "epoch": 0.3477555435370471, + "grad_norm": 3.65625, + "learning_rate": 0.019926520976433545, + "loss": 3.4088, + "mean_token_accuracy": 0.3685518503189087, + "num_tokens": 657766504.0, + "step": 1286 + }, + { + "epoch": 0.34802595997836666, + "grad_norm": 2.796875, + "learning_rate": 0.019926319801473955, + "loss": 3.1583, + "mean_token_accuracy": 0.42338502407073975, + "num_tokens": 658290774.0, + "step": 1287 + }, + { + "epoch": 0.3482963764196863, + "grad_norm": 2.921875, + "learning_rate": 0.019926118352628624, + "loss": 3.4584, + "mean_token_accuracy": 0.3894202709197998, + "num_tokens": 658763006.0, + "step": 1288 + }, + { + "epoch": 0.34856679286100595, + "grad_norm": 4.375, + "learning_rate": 0.01992591662990373, + "loss": 3.7528, + "mean_token_accuracy": 0.35089561343193054, + "num_tokens": 659287210.0, + "step": 1289 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 4.34375, + "learning_rate": 0.01992571463330547, + "loss": 3.5075, + "mean_token_accuracy": 0.3921527862548828, + "num_tokens": 659746963.0, + "step": 1290 + }, + { + "epoch": 0.34910762574364523, + "grad_norm": 147.0, + "learning_rate": 0.01992551236284003, + "loss": 11.3933, + "mean_token_accuracy": 0.005710638128221035, + "num_tokens": 660270063.0, + "step": 1291 + }, + { + "epoch": 0.3493780421849648, + "grad_norm": 11.75, + "learning_rate": 0.019925309818513624, + "loss": 4.6633, + "mean_token_accuracy": 0.29155415296554565, + "num_tokens": 660794306.0, + "step": 1292 + }, + { + "epoch": 0.34964845862628446, + "grad_norm": 2.6875, + "learning_rate": 0.019925107000332468, + "loss": 3.3788, + "mean_token_accuracy": 0.3809077739715576, + "num_tokens": 661237754.0, + "step": 1293 + }, + { + "epoch": 0.3499188750676041, + "grad_norm": 3.8125, + "learning_rate": 0.01992490390830278, + "loss": 3.5157, + "mean_token_accuracy": 0.34294021129608154, + "num_tokens": 661761917.0, + "step": 1294 + }, + { + "epoch": 0.35018929150892375, + "grad_norm": 2.90625, + "learning_rate": 0.019924700542430795, + "loss": 3.488, + "mean_token_accuracy": 0.3697531521320343, + "num_tokens": 662285895.0, + "step": 1295 + }, + { + "epoch": 0.3504597079502434, + "grad_norm": 3.546875, + "learning_rate": 0.01992449690272275, + "loss": 3.4939, + "mean_token_accuracy": 0.37424182891845703, + "num_tokens": 662810162.0, + "step": 1296 + }, + { + "epoch": 0.350730124391563, + "grad_norm": 3.0, + "learning_rate": 0.019924292989184897, + "loss": 3.4381, + "mean_token_accuracy": 0.3502412736415863, + "num_tokens": 663334336.0, + "step": 1297 + }, + { + "epoch": 0.3510005408328826, + "grad_norm": 21.375, + "learning_rate": 0.01992408880182349, + "loss": 3.3182, + "mean_token_accuracy": 0.40490126609802246, + "num_tokens": 663858558.0, + "step": 1298 + }, + { + "epoch": 0.35127095727420227, + "grad_norm": 2.8125, + "learning_rate": 0.01992388434064479, + "loss": 3.4323, + "mean_token_accuracy": 0.40558674931526184, + "num_tokens": 664358573.0, + "step": 1299 + }, + { + "epoch": 0.3515413737155219, + "grad_norm": 20.375, + "learning_rate": 0.019923679605655078, + "loss": 3.7077, + "mean_token_accuracy": 0.32847341895103455, + "num_tokens": 664882711.0, + "step": 1300 + }, + { + "epoch": 0.35181179015684155, + "grad_norm": 3.296875, + "learning_rate": 0.019923474596860637, + "loss": 3.7728, + "mean_token_accuracy": 0.3173917829990387, + "num_tokens": 665406739.0, + "step": 1301 + }, + { + "epoch": 0.3520822065981612, + "grad_norm": 2.78125, + "learning_rate": 0.019923269314267748, + "loss": 3.3671, + "mean_token_accuracy": 0.3541608452796936, + "num_tokens": 665897716.0, + "step": 1302 + }, + { + "epoch": 0.3523526230394808, + "grad_norm": 4.65625, + "learning_rate": 0.019923063757882716, + "loss": 3.5399, + "mean_token_accuracy": 0.3426756262779236, + "num_tokens": 666421844.0, + "step": 1303 + }, + { + "epoch": 0.3526230394808004, + "grad_norm": 2.59375, + "learning_rate": 0.019922857927711847, + "loss": 3.4427, + "mean_token_accuracy": 0.36141741275787354, + "num_tokens": 666928586.0, + "step": 1304 + }, + { + "epoch": 0.35289345592212007, + "grad_norm": 3.671875, + "learning_rate": 0.019922651823761454, + "loss": 3.6821, + "mean_token_accuracy": 0.3596211075782776, + "num_tokens": 667452759.0, + "step": 1305 + }, + { + "epoch": 0.3531638723634397, + "grad_norm": 2.765625, + "learning_rate": 0.019922445446037867, + "loss": 3.4808, + "mean_token_accuracy": 0.3565707802772522, + "num_tokens": 667976942.0, + "step": 1306 + }, + { + "epoch": 0.35343428880475936, + "grad_norm": 2.84375, + "learning_rate": 0.019922238794547412, + "loss": 3.2923, + "mean_token_accuracy": 0.36462175846099854, + "num_tokens": 668501161.0, + "step": 1307 + }, + { + "epoch": 0.35370470524607894, + "grad_norm": 2.875, + "learning_rate": 0.019922031869296437, + "loss": 3.3887, + "mean_token_accuracy": 0.3561651110649109, + "num_tokens": 669025377.0, + "step": 1308 + }, + { + "epoch": 0.3539751216873986, + "grad_norm": 2.5625, + "learning_rate": 0.019921824670291282, + "loss": 3.1772, + "mean_token_accuracy": 0.38369113206863403, + "num_tokens": 669549597.0, + "step": 1309 + }, + { + "epoch": 0.35424553812871823, + "grad_norm": 2.65625, + "learning_rate": 0.019921617197538308, + "loss": 3.3174, + "mean_token_accuracy": 0.4048430025577545, + "num_tokens": 670073790.0, + "step": 1310 + }, + { + "epoch": 0.3545159545700379, + "grad_norm": 202.0, + "learning_rate": 0.01992140945104388, + "loss": 23.2004, + "mean_token_accuracy": 2.4634244255139492e-05, + "num_tokens": 670598076.0, + "step": 1311 + }, + { + "epoch": 0.3547863710113575, + "grad_norm": 5.9375, + "learning_rate": 0.01992120143081438, + "loss": 3.8926, + "mean_token_accuracy": 0.29522252082824707, + "num_tokens": 671122276.0, + "step": 1312 + }, + { + "epoch": 0.3550567874526771, + "grad_norm": 2.234375, + "learning_rate": 0.01992099313685618, + "loss": 3.566, + "mean_token_accuracy": 0.3624420762062073, + "num_tokens": 671618155.0, + "step": 1313 + }, + { + "epoch": 0.35532720389399675, + "grad_norm": 4.21875, + "learning_rate": 0.01992078456917568, + "loss": 3.675, + "mean_token_accuracy": 0.29540500044822693, + "num_tokens": 672121413.0, + "step": 1314 + }, + { + "epoch": 0.3555976203353164, + "grad_norm": 2.640625, + "learning_rate": 0.01992057572777927, + "loss": 3.5504, + "mean_token_accuracy": 0.3685268759727478, + "num_tokens": 672645696.0, + "step": 1315 + }, + { + "epoch": 0.35586803677663603, + "grad_norm": 3.703125, + "learning_rate": 0.019920366612673365, + "loss": 3.8946, + "mean_token_accuracy": 0.31801095604896545, + "num_tokens": 673169877.0, + "step": 1316 + }, + { + "epoch": 0.3561384532179557, + "grad_norm": 2.359375, + "learning_rate": 0.01992015722386438, + "loss": 3.3967, + "mean_token_accuracy": 0.37024322152137756, + "num_tokens": 673694020.0, + "step": 1317 + }, + { + "epoch": 0.35640886965927526, + "grad_norm": 3.71875, + "learning_rate": 0.019919947561358738, + "loss": 3.7039, + "mean_token_accuracy": 0.3389657735824585, + "num_tokens": 674218234.0, + "step": 1318 + }, + { + "epoch": 0.3566792861005949, + "grad_norm": 2.75, + "learning_rate": 0.01991973762516287, + "loss": 3.5264, + "mean_token_accuracy": 0.37837398052215576, + "num_tokens": 674736653.0, + "step": 1319 + }, + { + "epoch": 0.35694970254191455, + "grad_norm": 4.1875, + "learning_rate": 0.019919527415283224, + "loss": 3.5341, + "mean_token_accuracy": 0.36040741205215454, + "num_tokens": 675211423.0, + "step": 1320 + }, + { + "epoch": 0.3572201189832342, + "grad_norm": 2.765625, + "learning_rate": 0.019919316931726248, + "loss": 3.5699, + "mean_token_accuracy": 0.37017279863357544, + "num_tokens": 675735596.0, + "step": 1321 + }, + { + "epoch": 0.35749053542455383, + "grad_norm": 3.890625, + "learning_rate": 0.019919106174498397, + "loss": 3.4248, + "mean_token_accuracy": 0.3612845838069916, + "num_tokens": 676259787.0, + "step": 1322 + }, + { + "epoch": 0.3577609518658734, + "grad_norm": 2.640625, + "learning_rate": 0.019918895143606136, + "loss": 3.4046, + "mean_token_accuracy": 0.38085418939590454, + "num_tokens": 676723396.0, + "step": 1323 + }, + { + "epoch": 0.35803136830719307, + "grad_norm": 2.640625, + "learning_rate": 0.01991868383905595, + "loss": 3.3881, + "mean_token_accuracy": 0.37930017709732056, + "num_tokens": 677247485.0, + "step": 1324 + }, + { + "epoch": 0.3583017847485127, + "grad_norm": 2.296875, + "learning_rate": 0.01991847226085431, + "loss": 3.2682, + "mean_token_accuracy": 0.37824660539627075, + "num_tokens": 677690326.0, + "step": 1325 + }, + { + "epoch": 0.35857220118983235, + "grad_norm": 3.328125, + "learning_rate": 0.019918260409007713, + "loss": 3.2879, + "mean_token_accuracy": 0.362704873085022, + "num_tokens": 678214477.0, + "step": 1326 + }, + { + "epoch": 0.358842617631152, + "grad_norm": 3.390625, + "learning_rate": 0.019918048283522664, + "loss": 3.1192, + "mean_token_accuracy": 0.39574670791625977, + "num_tokens": 678738756.0, + "step": 1327 + }, + { + "epoch": 0.3591130340724716, + "grad_norm": 2.9375, + "learning_rate": 0.019917835884405665, + "loss": 3.28, + "mean_token_accuracy": 0.3617357015609741, + "num_tokens": 679263016.0, + "step": 1328 + }, + { + "epoch": 0.3593834505137912, + "grad_norm": 3.109375, + "learning_rate": 0.019917623211663234, + "loss": 3.3878, + "mean_token_accuracy": 0.37074708938598633, + "num_tokens": 679745129.0, + "step": 1329 + }, + { + "epoch": 0.35965386695511087, + "grad_norm": 3.265625, + "learning_rate": 0.0199174102653019, + "loss": 3.5014, + "mean_token_accuracy": 0.35582998394966125, + "num_tokens": 680248517.0, + "step": 1330 + }, + { + "epoch": 0.3599242833964305, + "grad_norm": 11.625, + "learning_rate": 0.019917197045328194, + "loss": 11.2764, + "mean_token_accuracy": 0.0007085319957695901, + "num_tokens": 680772614.0, + "step": 1331 + }, + { + "epoch": 0.36019469983775015, + "grad_norm": 9.375, + "learning_rate": 0.01991698355174866, + "loss": 4.3004, + "mean_token_accuracy": 0.30553820729255676, + "num_tokens": 681296777.0, + "step": 1332 + }, + { + "epoch": 0.36046511627906974, + "grad_norm": 2.703125, + "learning_rate": 0.019916769784569847, + "loss": 3.5066, + "mean_token_accuracy": 0.3544890880584717, + "num_tokens": 681820957.0, + "step": 1333 + }, + { + "epoch": 0.3607355327203894, + "grad_norm": 3.515625, + "learning_rate": 0.019916555743798314, + "loss": 3.5476, + "mean_token_accuracy": 0.3378731608390808, + "num_tokens": 682345140.0, + "step": 1334 + }, + { + "epoch": 0.361005949161709, + "grad_norm": 2.859375, + "learning_rate": 0.019916341429440626, + "loss": 3.4421, + "mean_token_accuracy": 0.3513287901878357, + "num_tokens": 682869333.0, + "step": 1335 + }, + { + "epoch": 0.36127636560302867, + "grad_norm": 2.984375, + "learning_rate": 0.019916126841503366, + "loss": 3.3021, + "mean_token_accuracy": 0.3764558732509613, + "num_tokens": 683289839.0, + "step": 1336 + }, + { + "epoch": 0.3615467820443483, + "grad_norm": 3.15625, + "learning_rate": 0.019915911979993112, + "loss": 3.4059, + "mean_token_accuracy": 0.37947046756744385, + "num_tokens": 683814060.0, + "step": 1337 + }, + { + "epoch": 0.3618171984856679, + "grad_norm": 2.609375, + "learning_rate": 0.019915696844916463, + "loss": 3.2768, + "mean_token_accuracy": 0.37077218294143677, + "num_tokens": 684306362.0, + "step": 1338 + }, + { + "epoch": 0.36208761492698754, + "grad_norm": 2.3125, + "learning_rate": 0.01991548143628001, + "loss": 3.6526, + "mean_token_accuracy": 0.37563085556030273, + "num_tokens": 684788956.0, + "step": 1339 + }, + { + "epoch": 0.3623580313683072, + "grad_norm": 4.46875, + "learning_rate": 0.019915265754090368, + "loss": 3.8736, + "mean_token_accuracy": 0.32424601912498474, + "num_tokens": 685313132.0, + "step": 1340 + }, + { + "epoch": 0.36262844780962683, + "grad_norm": 2.421875, + "learning_rate": 0.019915049798354154, + "loss": 3.5072, + "mean_token_accuracy": 0.36945557594299316, + "num_tokens": 685837331.0, + "step": 1341 + }, + { + "epoch": 0.3628988642509465, + "grad_norm": 3.171875, + "learning_rate": 0.019914833569078, + "loss": 3.5143, + "mean_token_accuracy": 0.3790454864501953, + "num_tokens": 686361558.0, + "step": 1342 + }, + { + "epoch": 0.3631692806922661, + "grad_norm": 2.578125, + "learning_rate": 0.01991461706626853, + "loss": 3.465, + "mean_token_accuracy": 0.3617940843105316, + "num_tokens": 686885738.0, + "step": 1343 + }, + { + "epoch": 0.3634396971335857, + "grad_norm": 3.484375, + "learning_rate": 0.019914400289932393, + "loss": 3.5208, + "mean_token_accuracy": 0.3670973479747772, + "num_tokens": 687409909.0, + "step": 1344 + }, + { + "epoch": 0.36371011357490535, + "grad_norm": 2.578125, + "learning_rate": 0.01991418324007624, + "loss": 3.2537, + "mean_token_accuracy": 0.38986989855766296, + "num_tokens": 687934151.0, + "step": 1345 + }, + { + "epoch": 0.363980530016225, + "grad_norm": 2.859375, + "learning_rate": 0.019913965916706734, + "loss": 3.2357, + "mean_token_accuracy": 0.3841405212879181, + "num_tokens": 688458431.0, + "step": 1346 + }, + { + "epoch": 0.36425094645754463, + "grad_norm": 3.8125, + "learning_rate": 0.019913748319830536, + "loss": 3.5078, + "mean_token_accuracy": 0.37097877264022827, + "num_tokens": 688982647.0, + "step": 1347 + }, + { + "epoch": 0.3645213628988643, + "grad_norm": 3.984375, + "learning_rate": 0.019913530449454323, + "loss": 3.5696, + "mean_token_accuracy": 0.3524726331233978, + "num_tokens": 689506783.0, + "step": 1348 + }, + { + "epoch": 0.36479177934018386, + "grad_norm": 2.4375, + "learning_rate": 0.01991331230558479, + "loss": 3.2602, + "mean_token_accuracy": 0.3814830780029297, + "num_tokens": 690030866.0, + "step": 1349 + }, + { + "epoch": 0.3650621957815035, + "grad_norm": 2.75, + "learning_rate": 0.01991309388822862, + "loss": 3.3985, + "mean_token_accuracy": 0.36048948764801025, + "num_tokens": 690555004.0, + "step": 1350 + }, + { + "epoch": 0.36533261222282315, + "grad_norm": 274.0, + "learning_rate": 0.019912875197392512, + "loss": 20.9965, + "mean_token_accuracy": 0.000263180467300117, + "num_tokens": 691045095.0, + "step": 1351 + }, + { + "epoch": 0.3656030286641428, + "grad_norm": 6.3125, + "learning_rate": 0.01991265623308319, + "loss": 3.9587, + "mean_token_accuracy": 0.3373894691467285, + "num_tokens": 691569260.0, + "step": 1352 + }, + { + "epoch": 0.36587344510546244, + "grad_norm": 2.125, + "learning_rate": 0.01991243699530736, + "loss": 3.6137, + "mean_token_accuracy": 0.36037349700927734, + "num_tokens": 692062294.0, + "step": 1353 + }, + { + "epoch": 0.366143861546782, + "grad_norm": 2.671875, + "learning_rate": 0.019912217484071754, + "loss": 3.3859, + "mean_token_accuracy": 0.36611247062683105, + "num_tokens": 692586468.0, + "step": 1354 + }, + { + "epoch": 0.36641427798810167, + "grad_norm": 3.75, + "learning_rate": 0.019911997699383108, + "loss": 3.1146, + "mean_token_accuracy": 0.3583737015724182, + "num_tokens": 693110599.0, + "step": 1355 + }, + { + "epoch": 0.3666846944294213, + "grad_norm": 2.625, + "learning_rate": 0.01991177764124816, + "loss": 3.5375, + "mean_token_accuracy": 0.36661863327026367, + "num_tokens": 693634753.0, + "step": 1356 + }, + { + "epoch": 0.36695511087074095, + "grad_norm": 3.4375, + "learning_rate": 0.019911557309673672, + "loss": 3.6716, + "mean_token_accuracy": 0.35658353567123413, + "num_tokens": 694120320.0, + "step": 1357 + }, + { + "epoch": 0.3672255273120606, + "grad_norm": 3.25, + "learning_rate": 0.019911336704666393, + "loss": 3.3075, + "mean_token_accuracy": 0.3542656898498535, + "num_tokens": 694644526.0, + "step": 1358 + }, + { + "epoch": 0.3674959437533802, + "grad_norm": 3.0625, + "learning_rate": 0.019911115826233103, + "loss": 3.2547, + "mean_token_accuracy": 0.36892616748809814, + "num_tokens": 695168543.0, + "step": 1359 + }, + { + "epoch": 0.3677663601946998, + "grad_norm": 2.453125, + "learning_rate": 0.019910894674380567, + "loss": 3.1873, + "mean_token_accuracy": 0.4022481143474579, + "num_tokens": 695663738.0, + "step": 1360 + }, + { + "epoch": 0.36803677663601947, + "grad_norm": 3.15625, + "learning_rate": 0.019910673249115583, + "loss": 3.4623, + "mean_token_accuracy": 0.3598417341709137, + "num_tokens": 696187978.0, + "step": 1361 + }, + { + "epoch": 0.3683071930773391, + "grad_norm": 2.984375, + "learning_rate": 0.019910451550444935, + "loss": 3.4929, + "mean_token_accuracy": 0.3635568618774414, + "num_tokens": 696712186.0, + "step": 1362 + }, + { + "epoch": 0.36857760951865876, + "grad_norm": 3.671875, + "learning_rate": 0.019910229578375432, + "loss": 3.6149, + "mean_token_accuracy": 0.33599603176116943, + "num_tokens": 697236457.0, + "step": 1363 + }, + { + "epoch": 0.36884802595997834, + "grad_norm": 2.421875, + "learning_rate": 0.01991000733291388, + "loss": 3.1108, + "mean_token_accuracy": 0.3946317434310913, + "num_tokens": 697712093.0, + "step": 1364 + }, + { + "epoch": 0.369118442401298, + "grad_norm": 3.609375, + "learning_rate": 0.019909784814067105, + "loss": 3.1873, + "mean_token_accuracy": 0.3661254942417145, + "num_tokens": 698236297.0, + "step": 1365 + }, + { + "epoch": 0.36938885884261763, + "grad_norm": 2.8125, + "learning_rate": 0.019909562021841926, + "loss": 3.4129, + "mean_token_accuracy": 0.4120672643184662, + "num_tokens": 698642779.0, + "step": 1366 + }, + { + "epoch": 0.36965927528393727, + "grad_norm": 3.734375, + "learning_rate": 0.019909338956245186, + "loss": 3.4512, + "mean_token_accuracy": 0.3642173409461975, + "num_tokens": 699166979.0, + "step": 1367 + }, + { + "epoch": 0.3699296917252569, + "grad_norm": 3.0, + "learning_rate": 0.019909115617283722, + "loss": 3.631, + "mean_token_accuracy": 0.3528635799884796, + "num_tokens": 699691247.0, + "step": 1368 + }, + { + "epoch": 0.3702001081665765, + "grad_norm": 18.0, + "learning_rate": 0.019908892004964394, + "loss": 3.3553, + "mean_token_accuracy": 0.3675241768360138, + "num_tokens": 700156098.0, + "step": 1369 + }, + { + "epoch": 0.37047052460789615, + "grad_norm": 3.671875, + "learning_rate": 0.019908668119294058, + "loss": 3.7609, + "mean_token_accuracy": 0.3378659784793854, + "num_tokens": 700680250.0, + "step": 1370 + }, + { + "epoch": 0.3707409410492158, + "grad_norm": 77.5, + "learning_rate": 0.019908443960279586, + "loss": 12.2953, + "mean_token_accuracy": 0.013158997520804405, + "num_tokens": 701154117.0, + "step": 1371 + }, + { + "epoch": 0.37101135749053543, + "grad_norm": 8.5625, + "learning_rate": 0.019908219527927856, + "loss": 4.0836, + "mean_token_accuracy": 0.30981171131134033, + "num_tokens": 701678323.0, + "step": 1372 + }, + { + "epoch": 0.3712817739318551, + "grad_norm": 2.46875, + "learning_rate": 0.019907994822245752, + "loss": 3.6094, + "mean_token_accuracy": 0.3562849760055542, + "num_tokens": 702202587.0, + "step": 1373 + }, + { + "epoch": 0.37155219037317466, + "grad_norm": 2.625, + "learning_rate": 0.01990776984324017, + "loss": 3.6477, + "mean_token_accuracy": 0.35685378313064575, + "num_tokens": 702644138.0, + "step": 1374 + }, + { + "epoch": 0.3718226068144943, + "grad_norm": 4.625, + "learning_rate": 0.019907544590918015, + "loss": 3.6157, + "mean_token_accuracy": 0.34441861510276794, + "num_tokens": 703128397.0, + "step": 1375 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 3.671875, + "learning_rate": 0.019907319065286196, + "loss": 3.4925, + "mean_token_accuracy": 0.36116886138916016, + "num_tokens": 703588793.0, + "step": 1376 + }, + { + "epoch": 0.3723634396971336, + "grad_norm": 3.859375, + "learning_rate": 0.019907093266351636, + "loss": 3.6886, + "mean_token_accuracy": 0.339912474155426, + "num_tokens": 704112848.0, + "step": 1377 + }, + { + "epoch": 0.37263385613845323, + "grad_norm": 2.78125, + "learning_rate": 0.019906867194121255, + "loss": 3.5169, + "mean_token_accuracy": 0.37140727043151855, + "num_tokens": 704600608.0, + "step": 1378 + }, + { + "epoch": 0.3729042725797729, + "grad_norm": 2.78125, + "learning_rate": 0.019906640848601997, + "loss": 3.3814, + "mean_token_accuracy": 0.3686630129814148, + "num_tokens": 705124888.0, + "step": 1379 + }, + { + "epoch": 0.37317468902109246, + "grad_norm": 2.828125, + "learning_rate": 0.019906414229800808, + "loss": 3.4661, + "mean_token_accuracy": 0.350530207157135, + "num_tokens": 705648937.0, + "step": 1380 + }, + { + "epoch": 0.3734451054624121, + "grad_norm": 2.921875, + "learning_rate": 0.01990618733772464, + "loss": 3.2475, + "mean_token_accuracy": 0.36570242047309875, + "num_tokens": 706159306.0, + "step": 1381 + }, + { + "epoch": 0.37371552190373175, + "grad_norm": 2.96875, + "learning_rate": 0.019905960172380447, + "loss": 3.4249, + "mean_token_accuracy": 0.352244108915329, + "num_tokens": 706683467.0, + "step": 1382 + }, + { + "epoch": 0.3739859383450514, + "grad_norm": 2.890625, + "learning_rate": 0.019905732733775203, + "loss": 3.465, + "mean_token_accuracy": 0.3467180132865906, + "num_tokens": 707207735.0, + "step": 1383 + }, + { + "epoch": 0.37425635478637104, + "grad_norm": 3.1875, + "learning_rate": 0.019905505021915897, + "loss": 3.6219, + "mean_token_accuracy": 0.3508968651294708, + "num_tokens": 707731812.0, + "step": 1384 + }, + { + "epoch": 0.3745267712276906, + "grad_norm": 3.140625, + "learning_rate": 0.019905277036809502, + "loss": 3.4705, + "mean_token_accuracy": 0.36645984649658203, + "num_tokens": 708256085.0, + "step": 1385 + }, + { + "epoch": 0.37479718766901027, + "grad_norm": 3.21875, + "learning_rate": 0.01990504877846302, + "loss": 3.3913, + "mean_token_accuracy": 0.3797902762889862, + "num_tokens": 708780310.0, + "step": 1386 + }, + { + "epoch": 0.3750676041103299, + "grad_norm": 2.671875, + "learning_rate": 0.019904820246883455, + "loss": 3.3013, + "mean_token_accuracy": 0.37688523530960083, + "num_tokens": 709259529.0, + "step": 1387 + }, + { + "epoch": 0.37533802055164955, + "grad_norm": 3.03125, + "learning_rate": 0.019904591442077815, + "loss": 3.3882, + "mean_token_accuracy": 0.40675830841064453, + "num_tokens": 709719614.0, + "step": 1388 + }, + { + "epoch": 0.3756084369929692, + "grad_norm": 3.625, + "learning_rate": 0.019904362364053126, + "loss": 3.5666, + "mean_token_accuracy": 0.3482452630996704, + "num_tokens": 710243877.0, + "step": 1389 + }, + { + "epoch": 0.3758788534342888, + "grad_norm": 2.71875, + "learning_rate": 0.019904133012816412, + "loss": 3.4607, + "mean_token_accuracy": 0.36725878715515137, + "num_tokens": 710768115.0, + "step": 1390 + }, + { + "epoch": 0.3761492698756084, + "grad_norm": 181.0, + "learning_rate": 0.01990390338837471, + "loss": 40.5896, + "mean_token_accuracy": 5.287446401780471e-05, + "num_tokens": 711226964.0, + "step": 1391 + }, + { + "epoch": 0.37641968631692807, + "grad_norm": 9.125, + "learning_rate": 0.01990367349073507, + "loss": 3.8583, + "mean_token_accuracy": 0.31700167059898376, + "num_tokens": 711736456.0, + "step": 1392 + }, + { + "epoch": 0.3766901027582477, + "grad_norm": 2.890625, + "learning_rate": 0.01990344331990454, + "loss": 3.4, + "mean_token_accuracy": 0.3587205111980438, + "num_tokens": 712254496.0, + "step": 1393 + }, + { + "epoch": 0.37696051919956736, + "grad_norm": 2.828125, + "learning_rate": 0.01990321287589019, + "loss": 3.5789, + "mean_token_accuracy": 0.3540540635585785, + "num_tokens": 712778766.0, + "step": 1394 + }, + { + "epoch": 0.37723093564088694, + "grad_norm": 4.0, + "learning_rate": 0.019902982158699085, + "loss": 3.4352, + "mean_token_accuracy": 0.350885272026062, + "num_tokens": 713302819.0, + "step": 1395 + }, + { + "epoch": 0.3775013520822066, + "grad_norm": 3.09375, + "learning_rate": 0.019902751168338305, + "loss": 3.5832, + "mean_token_accuracy": 0.3800513744354248, + "num_tokens": 713826971.0, + "step": 1396 + }, + { + "epoch": 0.37777176852352623, + "grad_norm": 3.0, + "learning_rate": 0.01990251990481494, + "loss": 3.4182, + "mean_token_accuracy": 0.3540414273738861, + "num_tokens": 714351225.0, + "step": 1397 + }, + { + "epoch": 0.3780421849648459, + "grad_norm": 2.640625, + "learning_rate": 0.019902288368136078, + "loss": 3.4228, + "mean_token_accuracy": 0.3681761622428894, + "num_tokens": 714875500.0, + "step": 1398 + }, + { + "epoch": 0.3783126014061655, + "grad_norm": 4.78125, + "learning_rate": 0.019902056558308834, + "loss": 3.6679, + "mean_token_accuracy": 0.32647600769996643, + "num_tokens": 715355388.0, + "step": 1399 + }, + { + "epoch": 0.3785830178474851, + "grad_norm": 2.109375, + "learning_rate": 0.019901824475340314, + "loss": 3.538, + "mean_token_accuracy": 0.3474379777908325, + "num_tokens": 715879671.0, + "step": 1400 + }, + { + "epoch": 0.37885343428880475, + "grad_norm": 3.109375, + "learning_rate": 0.01990159211923764, + "loss": 3.2567, + "mean_token_accuracy": 0.3884185552597046, + "num_tokens": 716403866.0, + "step": 1401 + }, + { + "epoch": 0.3791238507301244, + "grad_norm": 3.375, + "learning_rate": 0.01990135949000794, + "loss": 3.3845, + "mean_token_accuracy": 0.3925398290157318, + "num_tokens": 716928012.0, + "step": 1402 + }, + { + "epoch": 0.37939426717144403, + "grad_norm": 2.84375, + "learning_rate": 0.019901126587658358, + "loss": 3.3142, + "mean_token_accuracy": 0.36937493085861206, + "num_tokens": 717452109.0, + "step": 1403 + }, + { + "epoch": 0.3796646836127637, + "grad_norm": 3.4375, + "learning_rate": 0.019900893412196034, + "loss": 3.4157, + "mean_token_accuracy": 0.38644784688949585, + "num_tokens": 717913056.0, + "step": 1404 + }, + { + "epoch": 0.37993510005408326, + "grad_norm": 2.5, + "learning_rate": 0.019900659963628123, + "loss": 3.4091, + "mean_token_accuracy": 0.3675983250141144, + "num_tokens": 718437171.0, + "step": 1405 + }, + { + "epoch": 0.3802055164954029, + "grad_norm": 3.046875, + "learning_rate": 0.01990042624196179, + "loss": 3.6194, + "mean_token_accuracy": 0.37229079008102417, + "num_tokens": 718898418.0, + "step": 1406 + }, + { + "epoch": 0.38047593293672255, + "grad_norm": 2.296875, + "learning_rate": 0.01990019224720421, + "loss": 3.3179, + "mean_token_accuracy": 0.37509727478027344, + "num_tokens": 719422700.0, + "step": 1407 + }, + { + "epoch": 0.3807463493780422, + "grad_norm": 2.953125, + "learning_rate": 0.019899957979362552, + "loss": 3.4277, + "mean_token_accuracy": 0.3641212582588196, + "num_tokens": 719946987.0, + "step": 1408 + }, + { + "epoch": 0.38101676581936184, + "grad_norm": 2.765625, + "learning_rate": 0.01989972343844401, + "loss": 3.3549, + "mean_token_accuracy": 0.3814183473587036, + "num_tokens": 720471249.0, + "step": 1409 + }, + { + "epoch": 0.3812871822606814, + "grad_norm": 3.34375, + "learning_rate": 0.019899488624455782, + "loss": 3.4271, + "mean_token_accuracy": 0.3586549162864685, + "num_tokens": 720995454.0, + "step": 1410 + }, + { + "epoch": 0.38155759870200107, + "grad_norm": 101.0, + "learning_rate": 0.019899253537405074, + "loss": 14.4562, + "mean_token_accuracy": 1.0807169019244611e-05, + "num_tokens": 721519536.0, + "step": 1411 + }, + { + "epoch": 0.3818280151433207, + "grad_norm": 9.3125, + "learning_rate": 0.019899018177299093, + "loss": 3.8921, + "mean_token_accuracy": 0.3061416745185852, + "num_tokens": 722043729.0, + "step": 1412 + }, + { + "epoch": 0.38209843158464035, + "grad_norm": 2.578125, + "learning_rate": 0.019898782544145066, + "loss": 3.6146, + "mean_token_accuracy": 0.3669593930244446, + "num_tokens": 722567873.0, + "step": 1413 + }, + { + "epoch": 0.38236884802596, + "grad_norm": 4.71875, + "learning_rate": 0.01989854663795022, + "loss": 3.3814, + "mean_token_accuracy": 0.3904535174369812, + "num_tokens": 723092035.0, + "step": 1414 + }, + { + "epoch": 0.3826392644672796, + "grad_norm": 2.484375, + "learning_rate": 0.019898310458721792, + "loss": 3.5791, + "mean_token_accuracy": 0.3668602406978607, + "num_tokens": 723611857.0, + "step": 1415 + }, + { + "epoch": 0.3829096809085992, + "grad_norm": 3.4375, + "learning_rate": 0.019898074006467033, + "loss": 3.5679, + "mean_token_accuracy": 0.3381289839744568, + "num_tokens": 724123438.0, + "step": 1416 + }, + { + "epoch": 0.38318009734991887, + "grad_norm": 2.59375, + "learning_rate": 0.0198978372811932, + "loss": 3.3553, + "mean_token_accuracy": 0.3804802894592285, + "num_tokens": 724647574.0, + "step": 1417 + }, + { + "epoch": 0.3834505137912385, + "grad_norm": 3.109375, + "learning_rate": 0.019897600282907546, + "loss": 3.6242, + "mean_token_accuracy": 0.34273654222488403, + "num_tokens": 725147077.0, + "step": 1418 + }, + { + "epoch": 0.38372093023255816, + "grad_norm": 3.828125, + "learning_rate": 0.019897363011617354, + "loss": 3.6305, + "mean_token_accuracy": 0.34378841519355774, + "num_tokens": 725665083.0, + "step": 1419 + }, + { + "epoch": 0.3839913466738778, + "grad_norm": 3.5625, + "learning_rate": 0.0198971254673299, + "loss": 3.5723, + "mean_token_accuracy": 0.33615750074386597, + "num_tokens": 726189282.0, + "step": 1420 + }, + { + "epoch": 0.3842617631151974, + "grad_norm": 3.0625, + "learning_rate": 0.019896887650052465, + "loss": 3.365, + "mean_token_accuracy": 0.37385356426239014, + "num_tokens": 726686395.0, + "step": 1421 + }, + { + "epoch": 0.38453217955651703, + "grad_norm": 3.4375, + "learning_rate": 0.019896649559792357, + "loss": 3.238, + "mean_token_accuracy": 0.3727753758430481, + "num_tokens": 727210570.0, + "step": 1422 + }, + { + "epoch": 0.38480259599783667, + "grad_norm": 2.734375, + "learning_rate": 0.019896411196556878, + "loss": 3.5252, + "mean_token_accuracy": 0.3777249753475189, + "num_tokens": 727675706.0, + "step": 1423 + }, + { + "epoch": 0.3850730124391563, + "grad_norm": 3.078125, + "learning_rate": 0.01989617256035334, + "loss": 3.3575, + "mean_token_accuracy": 0.37656551599502563, + "num_tokens": 728199807.0, + "step": 1424 + }, + { + "epoch": 0.38534342888047596, + "grad_norm": 3.09375, + "learning_rate": 0.019895933651189066, + "loss": 3.1967, + "mean_token_accuracy": 0.4074188470840454, + "num_tokens": 728714117.0, + "step": 1425 + }, + { + "epoch": 0.38561384532179555, + "grad_norm": 3.84375, + "learning_rate": 0.019895694469071384, + "loss": 3.4517, + "mean_token_accuracy": 0.3184359669685364, + "num_tokens": 729238067.0, + "step": 1426 + }, + { + "epoch": 0.3858842617631152, + "grad_norm": 3.109375, + "learning_rate": 0.019895455014007638, + "loss": 3.6185, + "mean_token_accuracy": 0.35634341835975647, + "num_tokens": 729762167.0, + "step": 1427 + }, + { + "epoch": 0.38615467820443483, + "grad_norm": 3.484375, + "learning_rate": 0.019895215286005172, + "loss": 3.5787, + "mean_token_accuracy": 0.34644216299057007, + "num_tokens": 730286446.0, + "step": 1428 + }, + { + "epoch": 0.3864250946457545, + "grad_norm": 3.515625, + "learning_rate": 0.01989497528507134, + "loss": 3.3598, + "mean_token_accuracy": 0.3672473430633545, + "num_tokens": 730810644.0, + "step": 1429 + }, + { + "epoch": 0.3866955110870741, + "grad_norm": 3.34375, + "learning_rate": 0.01989473501121351, + "loss": 3.4286, + "mean_token_accuracy": 0.36695149540901184, + "num_tokens": 731334685.0, + "step": 1430 + }, + { + "epoch": 0.3869659275283937, + "grad_norm": 48.25, + "learning_rate": 0.019894494464439052, + "loss": 10.6363, + "mean_token_accuracy": 0.01600603200495243, + "num_tokens": 731847979.0, + "step": 1431 + }, + { + "epoch": 0.38723634396971335, + "grad_norm": 7.375, + "learning_rate": 0.01989425364475535, + "loss": 4.2662, + "mean_token_accuracy": 0.2983320951461792, + "num_tokens": 732315331.0, + "step": 1432 + }, + { + "epoch": 0.387506760411033, + "grad_norm": 2.59375, + "learning_rate": 0.019894012552169785, + "loss": 3.486, + "mean_token_accuracy": 0.37876543402671814, + "num_tokens": 732814345.0, + "step": 1433 + }, + { + "epoch": 0.38777717685235263, + "grad_norm": 3.140625, + "learning_rate": 0.01989377118668976, + "loss": 3.6355, + "mean_token_accuracy": 0.3590705394744873, + "num_tokens": 733328121.0, + "step": 1434 + }, + { + "epoch": 0.3880475932936723, + "grad_norm": 3.25, + "learning_rate": 0.01989352954832268, + "loss": 3.4277, + "mean_token_accuracy": 0.3538752794265747, + "num_tokens": 733852256.0, + "step": 1435 + }, + { + "epoch": 0.38831800973499186, + "grad_norm": 3.546875, + "learning_rate": 0.01989328763707596, + "loss": 3.3716, + "mean_token_accuracy": 0.3602520823478699, + "num_tokens": 734376403.0, + "step": 1436 + }, + { + "epoch": 0.3885884261763115, + "grad_norm": 2.53125, + "learning_rate": 0.01989304545295702, + "loss": 3.6592, + "mean_token_accuracy": 0.3438284993171692, + "num_tokens": 734886494.0, + "step": 1437 + }, + { + "epoch": 0.38885884261763115, + "grad_norm": 2.9375, + "learning_rate": 0.019892802995973295, + "loss": 3.1657, + "mean_token_accuracy": 0.37631070613861084, + "num_tokens": 735363739.0, + "step": 1438 + }, + { + "epoch": 0.3891292590589508, + "grad_norm": 2.546875, + "learning_rate": 0.019892560266132224, + "loss": 3.1579, + "mean_token_accuracy": 0.3936833143234253, + "num_tokens": 735887924.0, + "step": 1439 + }, + { + "epoch": 0.38939967550027044, + "grad_norm": 2.765625, + "learning_rate": 0.01989231726344125, + "loss": 3.4072, + "mean_token_accuracy": 0.3759201765060425, + "num_tokens": 736370313.0, + "step": 1440 + }, + { + "epoch": 0.38967009194159, + "grad_norm": 2.90625, + "learning_rate": 0.01989207398790783, + "loss": 3.336, + "mean_token_accuracy": 0.38233721256256104, + "num_tokens": 736894570.0, + "step": 1441 + }, + { + "epoch": 0.38994050838290967, + "grad_norm": 2.59375, + "learning_rate": 0.019891830439539432, + "loss": 3.322, + "mean_token_accuracy": 0.3752642273902893, + "num_tokens": 737418844.0, + "step": 1442 + }, + { + "epoch": 0.3902109248242293, + "grad_norm": 4.9375, + "learning_rate": 0.019891586618343532, + "loss": 3.7784, + "mean_token_accuracy": 0.3396183252334595, + "num_tokens": 737907592.0, + "step": 1443 + }, + { + "epoch": 0.39048134126554895, + "grad_norm": 3.046875, + "learning_rate": 0.019891342524327598, + "loss": 3.1974, + "mean_token_accuracy": 0.3757363557815552, + "num_tokens": 738431761.0, + "step": 1444 + }, + { + "epoch": 0.3907517577068686, + "grad_norm": 2.734375, + "learning_rate": 0.01989109815749913, + "loss": 3.5655, + "mean_token_accuracy": 0.3604559898376465, + "num_tokens": 738943244.0, + "step": 1445 + }, + { + "epoch": 0.3910221741481882, + "grad_norm": 3.40625, + "learning_rate": 0.01989085351786563, + "loss": 3.6286, + "mean_token_accuracy": 0.3448333144187927, + "num_tokens": 739467349.0, + "step": 1446 + }, + { + "epoch": 0.3912925905895078, + "grad_norm": 3.265625, + "learning_rate": 0.01989060860543459, + "loss": 3.4195, + "mean_token_accuracy": 0.36147406697273254, + "num_tokens": 739987227.0, + "step": 1447 + }, + { + "epoch": 0.39156300703082747, + "grad_norm": 2.765625, + "learning_rate": 0.019890363420213532, + "loss": 3.2124, + "mean_token_accuracy": 0.3830398917198181, + "num_tokens": 740511213.0, + "step": 1448 + }, + { + "epoch": 0.3918334234721471, + "grad_norm": 2.6875, + "learning_rate": 0.019890117962209984, + "loss": 3.3081, + "mean_token_accuracy": 0.3899041414260864, + "num_tokens": 740975710.0, + "step": 1449 + }, + { + "epoch": 0.39210383991346676, + "grad_norm": 2.5625, + "learning_rate": 0.01988987223143147, + "loss": 3.5573, + "mean_token_accuracy": 0.36107105016708374, + "num_tokens": 741489983.0, + "step": 1450 + }, + { + "epoch": 0.39237425635478634, + "grad_norm": 225.0, + "learning_rate": 0.019889626227885534, + "loss": 12.3973, + "mean_token_accuracy": 0.03868263214826584, + "num_tokens": 741954053.0, + "step": 1451 + }, + { + "epoch": 0.392644672796106, + "grad_norm": 10.4375, + "learning_rate": 0.01988937995157972, + "loss": 3.9149, + "mean_token_accuracy": 0.2975747585296631, + "num_tokens": 742478124.0, + "step": 1452 + }, + { + "epoch": 0.39291508923742563, + "grad_norm": 2.90625, + "learning_rate": 0.019889133402521592, + "loss": 3.7469, + "mean_token_accuracy": 0.331814169883728, + "num_tokens": 743002309.0, + "step": 1453 + }, + { + "epoch": 0.3931855056787453, + "grad_norm": 2.5, + "learning_rate": 0.019888886580718707, + "loss": 3.5975, + "mean_token_accuracy": 0.357155978679657, + "num_tokens": 743526551.0, + "step": 1454 + }, + { + "epoch": 0.3934559221200649, + "grad_norm": 2.84375, + "learning_rate": 0.019888639486178642, + "loss": 3.2066, + "mean_token_accuracy": 0.379894882440567, + "num_tokens": 744050829.0, + "step": 1455 + }, + { + "epoch": 0.39372633856138456, + "grad_norm": 3.203125, + "learning_rate": 0.019888392118908978, + "loss": 3.5186, + "mean_token_accuracy": 0.331778347492218, + "num_tokens": 744575112.0, + "step": 1456 + }, + { + "epoch": 0.39399675500270415, + "grad_norm": 3.546875, + "learning_rate": 0.0198881444789173, + "loss": 3.5634, + "mean_token_accuracy": 0.35876452922821045, + "num_tokens": 745099141.0, + "step": 1457 + }, + { + "epoch": 0.3942671714440238, + "grad_norm": 3.1875, + "learning_rate": 0.019887896566211215, + "loss": 3.3689, + "mean_token_accuracy": 0.37053394317626953, + "num_tokens": 745623405.0, + "step": 1458 + }, + { + "epoch": 0.39453758788534343, + "grad_norm": 2.625, + "learning_rate": 0.019887648380798328, + "loss": 3.2263, + "mean_token_accuracy": 0.36459341645240784, + "num_tokens": 746147571.0, + "step": 1459 + }, + { + "epoch": 0.3948080043266631, + "grad_norm": 3.390625, + "learning_rate": 0.019887399922686247, + "loss": 3.6888, + "mean_token_accuracy": 0.35249632596969604, + "num_tokens": 746617213.0, + "step": 1460 + }, + { + "epoch": 0.3950784207679827, + "grad_norm": 4.0625, + "learning_rate": 0.019887151191882606, + "loss": 3.7332, + "mean_token_accuracy": 0.3701634407043457, + "num_tokens": 747141433.0, + "step": 1461 + }, + { + "epoch": 0.3953488372093023, + "grad_norm": 2.640625, + "learning_rate": 0.019886902188395027, + "loss": 3.406, + "mean_token_accuracy": 0.3573497235774994, + "num_tokens": 747621056.0, + "step": 1462 + }, + { + "epoch": 0.39561925365062195, + "grad_norm": 2.65625, + "learning_rate": 0.019886652912231153, + "loss": 3.4425, + "mean_token_accuracy": 0.3664283752441406, + "num_tokens": 748145328.0, + "step": 1463 + }, + { + "epoch": 0.3958896700919416, + "grad_norm": 3.0, + "learning_rate": 0.019886403363398635, + "loss": 3.4947, + "mean_token_accuracy": 0.3726044297218323, + "num_tokens": 748639856.0, + "step": 1464 + }, + { + "epoch": 0.39616008653326124, + "grad_norm": 3.65625, + "learning_rate": 0.019886153541905132, + "loss": 3.634, + "mean_token_accuracy": 0.34484627842903137, + "num_tokens": 749087637.0, + "step": 1465 + }, + { + "epoch": 0.3964305029745809, + "grad_norm": 2.84375, + "learning_rate": 0.0198859034477583, + "loss": 3.2469, + "mean_token_accuracy": 0.3622376322746277, + "num_tokens": 749611760.0, + "step": 1466 + }, + { + "epoch": 0.39670091941590047, + "grad_norm": 3.140625, + "learning_rate": 0.01988565308096582, + "loss": 3.3688, + "mean_token_accuracy": 0.3658416271209717, + "num_tokens": 750135932.0, + "step": 1467 + }, + { + "epoch": 0.3969713358572201, + "grad_norm": 2.546875, + "learning_rate": 0.019885402441535378, + "loss": 3.5641, + "mean_token_accuracy": 0.3505168557167053, + "num_tokens": 750660132.0, + "step": 1468 + }, + { + "epoch": 0.39724175229853975, + "grad_norm": 3.046875, + "learning_rate": 0.019885151529474655, + "loss": 3.3384, + "mean_token_accuracy": 0.3918367624282837, + "num_tokens": 751184350.0, + "step": 1469 + }, + { + "epoch": 0.3975121687398594, + "grad_norm": 2.5625, + "learning_rate": 0.019884900344791352, + "loss": 3.4198, + "mean_token_accuracy": 0.37039902806282043, + "num_tokens": 751708378.0, + "step": 1470 + }, + { + "epoch": 0.39778258518117904, + "grad_norm": 58.75, + "learning_rate": 0.01988464888749318, + "loss": 11.7454, + "mean_token_accuracy": 4.1866009269142523e-05, + "num_tokens": 752206726.0, + "step": 1471 + }, + { + "epoch": 0.3980530016224986, + "grad_norm": 6.53125, + "learning_rate": 0.019884397157587847, + "loss": 4.1156, + "mean_token_accuracy": 0.3089691996574402, + "num_tokens": 752731007.0, + "step": 1472 + }, + { + "epoch": 0.39832341806381827, + "grad_norm": 2.46875, + "learning_rate": 0.019884145155083088, + "loss": 3.5798, + "mean_token_accuracy": 0.3593329191207886, + "num_tokens": 753255191.0, + "step": 1473 + }, + { + "epoch": 0.3985938345051379, + "grad_norm": 3.890625, + "learning_rate": 0.01988389287998663, + "loss": 3.6269, + "mean_token_accuracy": 0.341495156288147, + "num_tokens": 753779354.0, + "step": 1474 + }, + { + "epoch": 0.39886425094645755, + "grad_norm": 3.703125, + "learning_rate": 0.019883640332306208, + "loss": 3.4029, + "mean_token_accuracy": 0.377032071352005, + "num_tokens": 754303624.0, + "step": 1475 + }, + { + "epoch": 0.3991346673877772, + "grad_norm": 3.84375, + "learning_rate": 0.01988338751204958, + "loss": 3.4891, + "mean_token_accuracy": 0.3553547263145447, + "num_tokens": 754827797.0, + "step": 1476 + }, + { + "epoch": 0.3994050838290968, + "grad_norm": 2.6875, + "learning_rate": 0.019883134419224492, + "loss": 3.4947, + "mean_token_accuracy": 0.3418956995010376, + "num_tokens": 755351856.0, + "step": 1477 + }, + { + "epoch": 0.39967550027041643, + "grad_norm": 2.890625, + "learning_rate": 0.019882881053838723, + "loss": 3.3464, + "mean_token_accuracy": 0.3587391972541809, + "num_tokens": 755876054.0, + "step": 1478 + }, + { + "epoch": 0.39994591671173607, + "grad_norm": 2.15625, + "learning_rate": 0.019882627415900037, + "loss": 3.3649, + "mean_token_accuracy": 0.3567681312561035, + "num_tokens": 756400136.0, + "step": 1479 + }, + { + "epoch": 0.4002163331530557, + "grad_norm": 2.84375, + "learning_rate": 0.01988237350541622, + "loss": 3.4509, + "mean_token_accuracy": 0.35660427808761597, + "num_tokens": 756924184.0, + "step": 1480 + }, + { + "epoch": 0.40048674959437536, + "grad_norm": 2.71875, + "learning_rate": 0.019882119322395062, + "loss": 3.4521, + "mean_token_accuracy": 0.38395994901657104, + "num_tokens": 757379561.0, + "step": 1481 + }, + { + "epoch": 0.40075716603569495, + "grad_norm": 3.75, + "learning_rate": 0.019881864866844366, + "loss": 3.434, + "mean_token_accuracy": 0.36584633588790894, + "num_tokens": 757903707.0, + "step": 1482 + }, + { + "epoch": 0.4010275824770146, + "grad_norm": 2.859375, + "learning_rate": 0.019881610138771932, + "loss": 3.0802, + "mean_token_accuracy": 0.38866105675697327, + "num_tokens": 758427893.0, + "step": 1483 + }, + { + "epoch": 0.40129799891833423, + "grad_norm": 3.078125, + "learning_rate": 0.01988135513818558, + "loss": 3.5956, + "mean_token_accuracy": 0.35042804479599, + "num_tokens": 758952168.0, + "step": 1484 + }, + { + "epoch": 0.4015684153596539, + "grad_norm": 3.375, + "learning_rate": 0.01988109986509314, + "loss": 3.4139, + "mean_token_accuracy": 0.37084704637527466, + "num_tokens": 759476361.0, + "step": 1485 + }, + { + "epoch": 0.4018388318009735, + "grad_norm": 3.5, + "learning_rate": 0.01988084431950243, + "loss": 3.2551, + "mean_token_accuracy": 0.36532142758369446, + "num_tokens": 760000522.0, + "step": 1486 + }, + { + "epoch": 0.4021092482422931, + "grad_norm": 2.34375, + "learning_rate": 0.019880588501421305, + "loss": 3.1776, + "mean_token_accuracy": 0.39631104469299316, + "num_tokens": 760478294.0, + "step": 1487 + }, + { + "epoch": 0.40237966468361275, + "grad_norm": 3.171875, + "learning_rate": 0.019880332410857605, + "loss": 3.3969, + "mean_token_accuracy": 0.3744358718395233, + "num_tokens": 761002489.0, + "step": 1488 + }, + { + "epoch": 0.4026500811249324, + "grad_norm": 2.390625, + "learning_rate": 0.019880076047819197, + "loss": 3.2042, + "mean_token_accuracy": 0.38536137342453003, + "num_tokens": 761526520.0, + "step": 1489 + }, + { + "epoch": 0.40292049756625203, + "grad_norm": 5.46875, + "learning_rate": 0.019879819412313936, + "loss": 2.9878, + "mean_token_accuracy": 0.4277969300746918, + "num_tokens": 762050767.0, + "step": 1490 + }, + { + "epoch": 0.4031909140075717, + "grad_norm": 64.5, + "learning_rate": 0.0198795625043497, + "loss": 13.5912, + "mean_token_accuracy": 0.006582094356417656, + "num_tokens": 762551556.0, + "step": 1491 + }, + { + "epoch": 0.4034613304488913, + "grad_norm": 4.9375, + "learning_rate": 0.01987930532393438, + "loss": 3.7777, + "mean_token_accuracy": 0.33189624547958374, + "num_tokens": 763074209.0, + "step": 1492 + }, + { + "epoch": 0.4037317468902109, + "grad_norm": 2.484375, + "learning_rate": 0.019879047871075858, + "loss": 3.2343, + "mean_token_accuracy": 0.3730837106704712, + "num_tokens": 763598468.0, + "step": 1493 + }, + { + "epoch": 0.40400216333153055, + "grad_norm": 3.15625, + "learning_rate": 0.019878790145782033, + "loss": 3.5019, + "mean_token_accuracy": 0.36024391651153564, + "num_tokens": 764122701.0, + "step": 1494 + }, + { + "epoch": 0.4042725797728502, + "grad_norm": 4.34375, + "learning_rate": 0.019878532148060818, + "loss": 3.6533, + "mean_token_accuracy": 0.3612651824951172, + "num_tokens": 764586038.0, + "step": 1495 + }, + { + "epoch": 0.40454299621416984, + "grad_norm": 4.375, + "learning_rate": 0.019878273877920124, + "loss": 3.3373, + "mean_token_accuracy": 0.3806052803993225, + "num_tokens": 765110235.0, + "step": 1496 + }, + { + "epoch": 0.4048134126554895, + "grad_norm": 3.15625, + "learning_rate": 0.01987801533536788, + "loss": 3.4804, + "mean_token_accuracy": 0.3589802384376526, + "num_tokens": 765634352.0, + "step": 1497 + }, + { + "epoch": 0.40508382909680907, + "grad_norm": 3.125, + "learning_rate": 0.01987775652041202, + "loss": 3.4553, + "mean_token_accuracy": 0.37021422386169434, + "num_tokens": 766158603.0, + "step": 1498 + }, + { + "epoch": 0.4053542455381287, + "grad_norm": 3.109375, + "learning_rate": 0.019877497433060477, + "loss": 3.4603, + "mean_token_accuracy": 0.34604281187057495, + "num_tokens": 766682806.0, + "step": 1499 + }, + { + "epoch": 0.40562466197944835, + "grad_norm": 2.953125, + "learning_rate": 0.019877238073321205, + "loss": 3.3981, + "mean_token_accuracy": 0.4026448130607605, + "num_tokens": 767207088.0, + "step": 1500 + }, + { + "epoch": 0.405895078420768, + "grad_norm": 2.875, + "learning_rate": 0.01987697844120217, + "loss": 3.2286, + "mean_token_accuracy": 0.36806008219718933, + "num_tokens": 767731350.0, + "step": 1501 + }, + { + "epoch": 0.40616549486208764, + "grad_norm": 3.671875, + "learning_rate": 0.019876718536711326, + "loss": 3.6016, + "mean_token_accuracy": 0.35974937677383423, + "num_tokens": 768255610.0, + "step": 1502 + }, + { + "epoch": 0.4064359113034072, + "grad_norm": 3.71875, + "learning_rate": 0.01987645835985665, + "loss": 3.4162, + "mean_token_accuracy": 0.3622455894947052, + "num_tokens": 768753345.0, + "step": 1503 + }, + { + "epoch": 0.40670632774472687, + "grad_norm": 2.859375, + "learning_rate": 0.01987619791064613, + "loss": 3.2743, + "mean_token_accuracy": 0.38594985008239746, + "num_tokens": 769277600.0, + "step": 1504 + }, + { + "epoch": 0.4069767441860465, + "grad_norm": 3.171875, + "learning_rate": 0.019875937189087756, + "loss": 3.2455, + "mean_token_accuracy": 0.3907596468925476, + "num_tokens": 769704216.0, + "step": 1505 + }, + { + "epoch": 0.40724716062736616, + "grad_norm": 2.078125, + "learning_rate": 0.019875676195189526, + "loss": 3.3499, + "mean_token_accuracy": 0.3790913224220276, + "num_tokens": 770228231.0, + "step": 1506 + }, + { + "epoch": 0.4075175770686858, + "grad_norm": 2.796875, + "learning_rate": 0.01987541492895945, + "loss": 3.1672, + "mean_token_accuracy": 0.3862132728099823, + "num_tokens": 770752380.0, + "step": 1507 + }, + { + "epoch": 0.4077879935100054, + "grad_norm": 2.140625, + "learning_rate": 0.019875153390405544, + "loss": 3.3668, + "mean_token_accuracy": 0.3772655725479126, + "num_tokens": 771276626.0, + "step": 1508 + }, + { + "epoch": 0.40805840995132503, + "grad_norm": 4.375, + "learning_rate": 0.019874891579535824, + "loss": 3.6581, + "mean_token_accuracy": 0.36239174008369446, + "num_tokens": 771800832.0, + "step": 1509 + }, + { + "epoch": 0.4083288263926447, + "grad_norm": 2.734375, + "learning_rate": 0.019874629496358338, + "loss": 3.2538, + "mean_token_accuracy": 0.3934522569179535, + "num_tokens": 772315331.0, + "step": 1510 + }, + { + "epoch": 0.4085992428339643, + "grad_norm": 32.0, + "learning_rate": 0.01987436714088112, + "loss": 9.4995, + "mean_token_accuracy": 0.03907495737075806, + "num_tokens": 772777356.0, + "step": 1511 + }, + { + "epoch": 0.40886965927528396, + "grad_norm": 9.5, + "learning_rate": 0.01987410451311222, + "loss": 3.8761, + "mean_token_accuracy": 0.3451288342475891, + "num_tokens": 773301562.0, + "step": 1512 + }, + { + "epoch": 0.40914007571660355, + "grad_norm": 3.3125, + "learning_rate": 0.019873841613059696, + "loss": 3.6121, + "mean_token_accuracy": 0.3117557168006897, + "num_tokens": 773825836.0, + "step": 1513 + }, + { + "epoch": 0.4094104921579232, + "grad_norm": 3.203125, + "learning_rate": 0.019873578440731614, + "loss": 3.5443, + "mean_token_accuracy": 0.3659079074859619, + "num_tokens": 774298148.0, + "step": 1514 + }, + { + "epoch": 0.40968090859924283, + "grad_norm": 3.4375, + "learning_rate": 0.01987331499613605, + "loss": 3.581, + "mean_token_accuracy": 0.34825029969215393, + "num_tokens": 774811736.0, + "step": 1515 + }, + { + "epoch": 0.4099513250405625, + "grad_norm": 3.03125, + "learning_rate": 0.01987305127928109, + "loss": 3.3846, + "mean_token_accuracy": 0.36802348494529724, + "num_tokens": 775335934.0, + "step": 1516 + }, + { + "epoch": 0.4102217414818821, + "grad_norm": 2.84375, + "learning_rate": 0.01987278729017482, + "loss": 3.603, + "mean_token_accuracy": 0.3718014657497406, + "num_tokens": 775852943.0, + "step": 1517 + }, + { + "epoch": 0.4104921579232017, + "grad_norm": 2.734375, + "learning_rate": 0.01987252302882534, + "loss": 3.4278, + "mean_token_accuracy": 0.3437354564666748, + "num_tokens": 776377156.0, + "step": 1518 + }, + { + "epoch": 0.41076257436452135, + "grad_norm": 3.03125, + "learning_rate": 0.019872258495240765, + "loss": 3.4168, + "mean_token_accuracy": 0.3622482419013977, + "num_tokens": 776901333.0, + "step": 1519 + }, + { + "epoch": 0.411032990805841, + "grad_norm": 2.671875, + "learning_rate": 0.01987199368942921, + "loss": 3.3924, + "mean_token_accuracy": 0.3820647597312927, + "num_tokens": 777425208.0, + "step": 1520 + }, + { + "epoch": 0.41130340724716064, + "grad_norm": 3.25, + "learning_rate": 0.019871728611398794, + "loss": 3.4263, + "mean_token_accuracy": 0.3765425682067871, + "num_tokens": 777886211.0, + "step": 1521 + }, + { + "epoch": 0.4115738236884803, + "grad_norm": 2.96875, + "learning_rate": 0.019871463261157657, + "loss": 3.5214, + "mean_token_accuracy": 0.37197011709213257, + "num_tokens": 778387604.0, + "step": 1522 + }, + { + "epoch": 0.41184424012979987, + "grad_norm": 2.953125, + "learning_rate": 0.01987119763871394, + "loss": 3.3734, + "mean_token_accuracy": 0.37723204493522644, + "num_tokens": 778882741.0, + "step": 1523 + }, + { + "epoch": 0.4121146565711195, + "grad_norm": 3.015625, + "learning_rate": 0.01987093174407578, + "loss": 3.4778, + "mean_token_accuracy": 0.35562634468078613, + "num_tokens": 779406965.0, + "step": 1524 + }, + { + "epoch": 0.41238507301243915, + "grad_norm": 3.296875, + "learning_rate": 0.019870665577251357, + "loss": 3.4884, + "mean_token_accuracy": 0.36483392119407654, + "num_tokens": 779931209.0, + "step": 1525 + }, + { + "epoch": 0.4126554894537588, + "grad_norm": 2.875, + "learning_rate": 0.019870399138248826, + "loss": 3.3404, + "mean_token_accuracy": 0.34698009490966797, + "num_tokens": 780436066.0, + "step": 1526 + }, + { + "epoch": 0.41292590589507844, + "grad_norm": 2.796875, + "learning_rate": 0.01987013242707636, + "loss": 3.4817, + "mean_token_accuracy": 0.3605377674102783, + "num_tokens": 780960335.0, + "step": 1527 + }, + { + "epoch": 0.413196322336398, + "grad_norm": 3.546875, + "learning_rate": 0.019869865443742144, + "loss": 3.7891, + "mean_token_accuracy": 0.35488373041152954, + "num_tokens": 781437680.0, + "step": 1528 + }, + { + "epoch": 0.41346673877771767, + "grad_norm": 4.0625, + "learning_rate": 0.019869598188254376, + "loss": 3.3805, + "mean_token_accuracy": 0.3658300042152405, + "num_tokens": 781961833.0, + "step": 1529 + }, + { + "epoch": 0.4137371552190373, + "grad_norm": 2.546875, + "learning_rate": 0.01986933066062125, + "loss": 3.2738, + "mean_token_accuracy": 0.40342646837234497, + "num_tokens": 782428303.0, + "step": 1530 + }, + { + "epoch": 0.41400757166035695, + "grad_norm": 63.25, + "learning_rate": 0.019869062860850974, + "loss": 11.2589, + "mean_token_accuracy": 0.002866589231416583, + "num_tokens": 782952584.0, + "step": 1531 + }, + { + "epoch": 0.4142779881016766, + "grad_norm": 8.125, + "learning_rate": 0.01986879478895177, + "loss": 3.9929, + "mean_token_accuracy": 0.3074215054512024, + "num_tokens": 783468298.0, + "step": 1532 + }, + { + "epoch": 0.41454840454299624, + "grad_norm": 19.5, + "learning_rate": 0.019868526444931863, + "loss": 3.1158, + "mean_token_accuracy": 0.4504564702510834, + "num_tokens": 783992573.0, + "step": 1533 + }, + { + "epoch": 0.41481882098431583, + "grad_norm": 3.265625, + "learning_rate": 0.01986825782879948, + "loss": 3.5842, + "mean_token_accuracy": 0.32807809114456177, + "num_tokens": 784516662.0, + "step": 1534 + }, + { + "epoch": 0.41508923742563547, + "grad_norm": 3.25, + "learning_rate": 0.01986798894056287, + "loss": 3.5861, + "mean_token_accuracy": 0.3588079512119293, + "num_tokens": 785003148.0, + "step": 1535 + }, + { + "epoch": 0.4153596538669551, + "grad_norm": 3.359375, + "learning_rate": 0.01986771978023028, + "loss": 3.6395, + "mean_token_accuracy": 0.39402708411216736, + "num_tokens": 785464656.0, + "step": 1536 + }, + { + "epoch": 0.41563007030827476, + "grad_norm": 2.921875, + "learning_rate": 0.019867450347809963, + "loss": 3.69, + "mean_token_accuracy": 0.35992756485939026, + "num_tokens": 785988900.0, + "step": 1537 + }, + { + "epoch": 0.4159004867495944, + "grad_norm": 4.3125, + "learning_rate": 0.0198671806433102, + "loss": 3.51, + "mean_token_accuracy": 0.3737192153930664, + "num_tokens": 786513024.0, + "step": 1538 + }, + { + "epoch": 0.416170903190914, + "grad_norm": 2.671875, + "learning_rate": 0.019866910666739252, + "loss": 3.569, + "mean_token_accuracy": 0.3554387390613556, + "num_tokens": 787037207.0, + "step": 1539 + }, + { + "epoch": 0.41644131963223363, + "grad_norm": 3.375, + "learning_rate": 0.019866640418105413, + "loss": 3.3894, + "mean_token_accuracy": 0.366428405046463, + "num_tokens": 787561342.0, + "step": 1540 + }, + { + "epoch": 0.4167117360735533, + "grad_norm": 3.0, + "learning_rate": 0.019866369897416965, + "loss": 3.2668, + "mean_token_accuracy": 0.39138054847717285, + "num_tokens": 788000768.0, + "step": 1541 + }, + { + "epoch": 0.4169821525148729, + "grad_norm": 3.515625, + "learning_rate": 0.01986609910468222, + "loss": 3.5037, + "mean_token_accuracy": 0.37555164098739624, + "num_tokens": 788484973.0, + "step": 1542 + }, + { + "epoch": 0.41725256895619256, + "grad_norm": 3.0, + "learning_rate": 0.01986582803990948, + "loss": 3.4381, + "mean_token_accuracy": 0.39839935302734375, + "num_tokens": 788916439.0, + "step": 1543 + }, + { + "epoch": 0.41752298539751215, + "grad_norm": 2.921875, + "learning_rate": 0.01986555670310706, + "loss": 3.4387, + "mean_token_accuracy": 0.35857492685317993, + "num_tokens": 789440705.0, + "step": 1544 + }, + { + "epoch": 0.4177934018388318, + "grad_norm": 2.578125, + "learning_rate": 0.019865285094283293, + "loss": 3.3162, + "mean_token_accuracy": 0.38685622811317444, + "num_tokens": 789964870.0, + "step": 1545 + }, + { + "epoch": 0.41806381828015143, + "grad_norm": 2.640625, + "learning_rate": 0.019865013213446504, + "loss": 3.4762, + "mean_token_accuracy": 0.3655658960342407, + "num_tokens": 790489053.0, + "step": 1546 + }, + { + "epoch": 0.4183342347214711, + "grad_norm": 2.765625, + "learning_rate": 0.019864741060605044, + "loss": 3.4263, + "mean_token_accuracy": 0.3958280682563782, + "num_tokens": 791013243.0, + "step": 1547 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 3.9375, + "learning_rate": 0.019864468635767255, + "loss": 3.4183, + "mean_token_accuracy": 0.3640943169593811, + "num_tokens": 791537500.0, + "step": 1548 + }, + { + "epoch": 0.4188750676041103, + "grad_norm": 3.453125, + "learning_rate": 0.019864195938941502, + "loss": 3.4886, + "mean_token_accuracy": 0.3626924157142639, + "num_tokens": 792061671.0, + "step": 1549 + }, + { + "epoch": 0.41914548404542995, + "grad_norm": 3.6875, + "learning_rate": 0.019863922970136148, + "loss": 3.4591, + "mean_token_accuracy": 0.3751053214073181, + "num_tokens": 792542203.0, + "step": 1550 + }, + { + "epoch": 0.4194159004867496, + "grad_norm": 31.125, + "learning_rate": 0.019863649729359566, + "loss": 9.6426, + "mean_token_accuracy": 0.036502107977867126, + "num_tokens": 793066421.0, + "step": 1551 + }, + { + "epoch": 0.41968631692806924, + "grad_norm": 6.34375, + "learning_rate": 0.01986337621662015, + "loss": 3.997, + "mean_token_accuracy": 0.33747291564941406, + "num_tokens": 793590600.0, + "step": 1552 + }, + { + "epoch": 0.4199567333693889, + "grad_norm": 2.671875, + "learning_rate": 0.019863102431926287, + "loss": 3.5971, + "mean_token_accuracy": 0.3486028015613556, + "num_tokens": 794114867.0, + "step": 1553 + }, + { + "epoch": 0.42022714981070847, + "grad_norm": 3.015625, + "learning_rate": 0.019862828375286374, + "loss": 3.4922, + "mean_token_accuracy": 0.3794816732406616, + "num_tokens": 794639027.0, + "step": 1554 + }, + { + "epoch": 0.4204975662520281, + "grad_norm": 3.671875, + "learning_rate": 0.019862554046708822, + "loss": 3.3412, + "mean_token_accuracy": 0.3535584807395935, + "num_tokens": 795163277.0, + "step": 1555 + }, + { + "epoch": 0.42076798269334775, + "grad_norm": 3.140625, + "learning_rate": 0.01986227944620205, + "loss": 3.4294, + "mean_token_accuracy": 0.3643202483654022, + "num_tokens": 795687484.0, + "step": 1556 + }, + { + "epoch": 0.4210383991346674, + "grad_norm": 37.25, + "learning_rate": 0.019862004573774485, + "loss": 3.3165, + "mean_token_accuracy": 0.40506869554519653, + "num_tokens": 796211712.0, + "step": 1557 + }, + { + "epoch": 0.42130881557598704, + "grad_norm": 4.8125, + "learning_rate": 0.019861729429434552, + "loss": 3.8289, + "mean_token_accuracy": 0.32389602065086365, + "num_tokens": 796735866.0, + "step": 1558 + }, + { + "epoch": 0.4215792320173066, + "grad_norm": 2.203125, + "learning_rate": 0.019861454013190703, + "loss": 3.4574, + "mean_token_accuracy": 0.35771483182907104, + "num_tokens": 797260104.0, + "step": 1559 + }, + { + "epoch": 0.42184964845862627, + "grad_norm": 2.9375, + "learning_rate": 0.019861178325051385, + "loss": 3.3981, + "mean_token_accuracy": 0.36906668543815613, + "num_tokens": 797749971.0, + "step": 1560 + }, + { + "epoch": 0.4221200648999459, + "grad_norm": 2.75, + "learning_rate": 0.019860902365025055, + "loss": 3.5578, + "mean_token_accuracy": 0.36936670541763306, + "num_tokens": 798274250.0, + "step": 1561 + }, + { + "epoch": 0.42239048134126556, + "grad_norm": 3.640625, + "learning_rate": 0.019860626133120184, + "loss": 3.5703, + "mean_token_accuracy": 0.36695706844329834, + "num_tokens": 798759055.0, + "step": 1562 + }, + { + "epoch": 0.4226608977825852, + "grad_norm": 2.4375, + "learning_rate": 0.019860349629345245, + "loss": 3.342, + "mean_token_accuracy": 0.37156879901885986, + "num_tokens": 799283214.0, + "step": 1563 + }, + { + "epoch": 0.4229313142239048, + "grad_norm": 2.75, + "learning_rate": 0.019860072853708725, + "loss": 3.4247, + "mean_token_accuracy": 0.3632257878780365, + "num_tokens": 799807420.0, + "step": 1564 + }, + { + "epoch": 0.42320173066522443, + "grad_norm": 3.265625, + "learning_rate": 0.019859795806219115, + "loss": 3.5867, + "mean_token_accuracy": 0.34248754382133484, + "num_tokens": 800331443.0, + "step": 1565 + }, + { + "epoch": 0.4234721471065441, + "grad_norm": 3.359375, + "learning_rate": 0.01985951848688491, + "loss": 3.4808, + "mean_token_accuracy": 0.35894638299942017, + "num_tokens": 800826848.0, + "step": 1566 + }, + { + "epoch": 0.4237425635478637, + "grad_norm": 4.6875, + "learning_rate": 0.019859240895714628, + "loss": 3.4883, + "mean_token_accuracy": 0.3767983317375183, + "num_tokens": 801339589.0, + "step": 1567 + }, + { + "epoch": 0.42401297998918336, + "grad_norm": 2.34375, + "learning_rate": 0.019858963032716778, + "loss": 3.4053, + "mean_token_accuracy": 0.3771783113479614, + "num_tokens": 801863840.0, + "step": 1568 + }, + { + "epoch": 0.424283396430503, + "grad_norm": 2.921875, + "learning_rate": 0.01985868489789989, + "loss": 3.3812, + "mean_token_accuracy": 0.3606763482093811, + "num_tokens": 802388000.0, + "step": 1569 + }, + { + "epoch": 0.4245538128718226, + "grad_norm": 2.65625, + "learning_rate": 0.019858406491272504, + "loss": 3.2912, + "mean_token_accuracy": 0.38189083337783813, + "num_tokens": 802912122.0, + "step": 1570 + }, + { + "epoch": 0.42482422931314223, + "grad_norm": 40.75, + "learning_rate": 0.01985812781284315, + "loss": 11.1115, + "mean_token_accuracy": 0.007192248944193125, + "num_tokens": 803436325.0, + "step": 1571 + }, + { + "epoch": 0.4250946457544619, + "grad_norm": 10.3125, + "learning_rate": 0.019857848862620386, + "loss": 4.3149, + "mean_token_accuracy": 0.2927183210849762, + "num_tokens": 803958381.0, + "step": 1572 + }, + { + "epoch": 0.4253650621957815, + "grad_norm": 3.3125, + "learning_rate": 0.01985756964061277, + "loss": 3.7698, + "mean_token_accuracy": 0.3533792793750763, + "num_tokens": 804482590.0, + "step": 1573 + }, + { + "epoch": 0.42563547863710116, + "grad_norm": 4.875, + "learning_rate": 0.019857290146828868, + "loss": 3.5321, + "mean_token_accuracy": 0.36078184843063354, + "num_tokens": 805006831.0, + "step": 1574 + }, + { + "epoch": 0.42590589507842075, + "grad_norm": 3.171875, + "learning_rate": 0.019857010381277258, + "loss": 3.5365, + "mean_token_accuracy": 0.356623113155365, + "num_tokens": 805530992.0, + "step": 1575 + }, + { + "epoch": 0.4261763115197404, + "grad_norm": 3.09375, + "learning_rate": 0.01985673034396652, + "loss": 3.7516, + "mean_token_accuracy": 0.33941924571990967, + "num_tokens": 806055221.0, + "step": 1576 + }, + { + "epoch": 0.42644672796106003, + "grad_norm": 2.4375, + "learning_rate": 0.01985645003490525, + "loss": 3.2684, + "mean_token_accuracy": 0.3902556598186493, + "num_tokens": 806546086.0, + "step": 1577 + }, + { + "epoch": 0.4267171444023797, + "grad_norm": 3.203125, + "learning_rate": 0.01985616945410205, + "loss": 3.5276, + "mean_token_accuracy": 0.34269073605537415, + "num_tokens": 807070280.0, + "step": 1578 + }, + { + "epoch": 0.4269875608436993, + "grad_norm": 2.671875, + "learning_rate": 0.01985588860156553, + "loss": 3.3609, + "mean_token_accuracy": 0.34652179479599, + "num_tokens": 807594552.0, + "step": 1579 + }, + { + "epoch": 0.4272579772850189, + "grad_norm": 3.40625, + "learning_rate": 0.019855607477304298, + "loss": 3.4714, + "mean_token_accuracy": 0.3510335683822632, + "num_tokens": 808118775.0, + "step": 1580 + }, + { + "epoch": 0.42752839372633855, + "grad_norm": 3.0625, + "learning_rate": 0.019855326081326986, + "loss": 3.2617, + "mean_token_accuracy": 0.36888977885246277, + "num_tokens": 808642897.0, + "step": 1581 + }, + { + "epoch": 0.4277988101676582, + "grad_norm": 4.8125, + "learning_rate": 0.01985504441364223, + "loss": 3.4316, + "mean_token_accuracy": 0.3657612204551697, + "num_tokens": 809163617.0, + "step": 1582 + }, + { + "epoch": 0.42806922660897784, + "grad_norm": 1.96875, + "learning_rate": 0.01985476247425867, + "loss": 3.447, + "mean_token_accuracy": 0.3652338981628418, + "num_tokens": 809679941.0, + "step": 1583 + }, + { + "epoch": 0.4283396430502975, + "grad_norm": 3.828125, + "learning_rate": 0.019854480263184957, + "loss": 3.49, + "mean_token_accuracy": 0.36409157514572144, + "num_tokens": 810204174.0, + "step": 1584 + }, + { + "epoch": 0.42861005949161707, + "grad_norm": 2.890625, + "learning_rate": 0.019854197780429753, + "loss": 3.5549, + "mean_token_accuracy": 0.3536635637283325, + "num_tokens": 810728437.0, + "step": 1585 + }, + { + "epoch": 0.4288804759329367, + "grad_norm": 2.890625, + "learning_rate": 0.019853915026001718, + "loss": 3.2925, + "mean_token_accuracy": 0.3884139358997345, + "num_tokens": 811239782.0, + "step": 1586 + }, + { + "epoch": 0.42915089237425635, + "grad_norm": 2.875, + "learning_rate": 0.019853631999909534, + "loss": 3.4452, + "mean_token_accuracy": 0.3681899905204773, + "num_tokens": 811750877.0, + "step": 1587 + }, + { + "epoch": 0.429421308815576, + "grad_norm": 3.59375, + "learning_rate": 0.019853348702161887, + "loss": 3.3926, + "mean_token_accuracy": 0.36511552333831787, + "num_tokens": 812275047.0, + "step": 1588 + }, + { + "epoch": 0.42969172525689564, + "grad_norm": 2.5625, + "learning_rate": 0.01985306513276746, + "loss": 3.3965, + "mean_token_accuracy": 0.41047054529190063, + "num_tokens": 812725862.0, + "step": 1589 + }, + { + "epoch": 0.42996214169821523, + "grad_norm": 3.6875, + "learning_rate": 0.019852781291734962, + "loss": 3.5215, + "mean_token_accuracy": 0.3518064022064209, + "num_tokens": 813250029.0, + "step": 1590 + }, + { + "epoch": 0.43023255813953487, + "grad_norm": 0.466796875, + "learning_rate": 0.0198524971790731, + "loss": 11.0922, + "mean_token_accuracy": 1.21711018437054e-05, + "num_tokens": 813774311.0, + "step": 1591 + }, + { + "epoch": 0.4305029745808545, + "grad_norm": 7.28125, + "learning_rate": 0.019852212794790592, + "loss": 4.0232, + "mean_token_accuracy": 0.33668604493141174, + "num_tokens": 814298447.0, + "step": 1592 + }, + { + "epoch": 0.43077339102217416, + "grad_norm": 4.90625, + "learning_rate": 0.019851928138896163, + "loss": 3.5042, + "mean_token_accuracy": 0.37998032569885254, + "num_tokens": 814822662.0, + "step": 1593 + }, + { + "epoch": 0.4310438074634938, + "grad_norm": 3.9375, + "learning_rate": 0.019851643211398543, + "loss": 3.6657, + "mean_token_accuracy": 0.3420161008834839, + "num_tokens": 815346824.0, + "step": 1594 + }, + { + "epoch": 0.4313142239048134, + "grad_norm": 3.359375, + "learning_rate": 0.019851358012306485, + "loss": 3.4432, + "mean_token_accuracy": 0.34816503524780273, + "num_tokens": 815870965.0, + "step": 1595 + }, + { + "epoch": 0.43158464034613303, + "grad_norm": 2.6875, + "learning_rate": 0.01985107254162873, + "loss": 3.3867, + "mean_token_accuracy": 0.39441144466400146, + "num_tokens": 816276308.0, + "step": 1596 + }, + { + "epoch": 0.4318550567874527, + "grad_norm": 4.4375, + "learning_rate": 0.019850786799374038, + "loss": 3.648, + "mean_token_accuracy": 0.3472670316696167, + "num_tokens": 816744648.0, + "step": 1597 + }, + { + "epoch": 0.4321254732287723, + "grad_norm": 3.296875, + "learning_rate": 0.019850500785551176, + "loss": 3.4664, + "mean_token_accuracy": 0.37330254912376404, + "num_tokens": 817214495.0, + "step": 1598 + }, + { + "epoch": 0.43239588967009196, + "grad_norm": 3.609375, + "learning_rate": 0.01985021450016893, + "loss": 3.689, + "mean_token_accuracy": 0.35092663764953613, + "num_tokens": 817738770.0, + "step": 1599 + }, + { + "epoch": 0.43266630611141155, + "grad_norm": 3.375, + "learning_rate": 0.01984992794323607, + "loss": 3.6127, + "mean_token_accuracy": 0.3640325963497162, + "num_tokens": 818262994.0, + "step": 1600 + }, + { + "epoch": 0.4329367225527312, + "grad_norm": 5.09375, + "learning_rate": 0.0198496411147614, + "loss": 3.6542, + "mean_token_accuracy": 0.3303350806236267, + "num_tokens": 818787252.0, + "step": 1601 + }, + { + "epoch": 0.43320713899405083, + "grad_norm": 3.59375, + "learning_rate": 0.019849354014753712, + "loss": 3.5016, + "mean_token_accuracy": 0.37646204233169556, + "num_tokens": 819249822.0, + "step": 1602 + }, + { + "epoch": 0.4334775554353705, + "grad_norm": 3.984375, + "learning_rate": 0.019849066643221815, + "loss": 3.6997, + "mean_token_accuracy": 0.3404659032821655, + "num_tokens": 819773990.0, + "step": 1603 + }, + { + "epoch": 0.4337479718766901, + "grad_norm": 3.453125, + "learning_rate": 0.019848779000174534, + "loss": 3.3131, + "mean_token_accuracy": 0.3570955991744995, + "num_tokens": 820260271.0, + "step": 1604 + }, + { + "epoch": 0.43401838831800976, + "grad_norm": 2.609375, + "learning_rate": 0.019848491085620694, + "loss": 3.5026, + "mean_token_accuracy": 0.3733346164226532, + "num_tokens": 820783436.0, + "step": 1605 + }, + { + "epoch": 0.43428880475932935, + "grad_norm": 3.078125, + "learning_rate": 0.01984820289956912, + "loss": 3.504, + "mean_token_accuracy": 0.3751481771469116, + "num_tokens": 821264671.0, + "step": 1606 + }, + { + "epoch": 0.434559221200649, + "grad_norm": 3.375, + "learning_rate": 0.019847914442028665, + "loss": 3.3611, + "mean_token_accuracy": 0.35533738136291504, + "num_tokens": 821788809.0, + "step": 1607 + }, + { + "epoch": 0.43482963764196864, + "grad_norm": 3.296875, + "learning_rate": 0.019847625713008174, + "loss": 3.6454, + "mean_token_accuracy": 0.3567625880241394, + "num_tokens": 822313048.0, + "step": 1608 + }, + { + "epoch": 0.4351000540832883, + "grad_norm": 3.78125, + "learning_rate": 0.0198473367125165, + "loss": 3.6307, + "mean_token_accuracy": 0.3688212037086487, + "num_tokens": 822807269.0, + "step": 1609 + }, + { + "epoch": 0.4353704705246079, + "grad_norm": 2.734375, + "learning_rate": 0.019847047440562527, + "loss": 3.3733, + "mean_token_accuracy": 0.3904956579208374, + "num_tokens": 823300535.0, + "step": 1610 + }, + { + "epoch": 0.4356408869659275, + "grad_norm": 48.5, + "learning_rate": 0.019846757897155117, + "loss": 31.6384, + "mean_token_accuracy": 0.03748006373643875, + "num_tokens": 823824697.0, + "step": 1611 + }, + { + "epoch": 0.43591130340724715, + "grad_norm": 7.03125, + "learning_rate": 0.019846468082303156, + "loss": 4.034, + "mean_token_accuracy": 0.31029239296913147, + "num_tokens": 824330212.0, + "step": 1612 + }, + { + "epoch": 0.4361817198485668, + "grad_norm": 2.390625, + "learning_rate": 0.01984617799601554, + "loss": 3.6338, + "mean_token_accuracy": 0.3606584370136261, + "num_tokens": 824854425.0, + "step": 1613 + }, + { + "epoch": 0.43645213628988644, + "grad_norm": 2.359375, + "learning_rate": 0.01984588763830117, + "loss": 3.3618, + "mean_token_accuracy": 0.36795544624328613, + "num_tokens": 825378628.0, + "step": 1614 + }, + { + "epoch": 0.4367225527312061, + "grad_norm": 3.109375, + "learning_rate": 0.019845597009168953, + "loss": 3.7097, + "mean_token_accuracy": 0.3922739624977112, + "num_tokens": 825839881.0, + "step": 1615 + }, + { + "epoch": 0.43699296917252567, + "grad_norm": 3.453125, + "learning_rate": 0.019845306108627805, + "loss": 3.4905, + "mean_token_accuracy": 0.3578750491142273, + "num_tokens": 826350719.0, + "step": 1616 + }, + { + "epoch": 0.4372633856138453, + "grad_norm": 2.984375, + "learning_rate": 0.019845014936686653, + "loss": 3.4375, + "mean_token_accuracy": 0.37124019861221313, + "num_tokens": 826874964.0, + "step": 1617 + }, + { + "epoch": 0.43753380205516496, + "grad_norm": 3.0, + "learning_rate": 0.019844723493354432, + "loss": 3.4664, + "mean_token_accuracy": 0.37647342681884766, + "num_tokens": 827362375.0, + "step": 1618 + }, + { + "epoch": 0.4378042184964846, + "grad_norm": 2.515625, + "learning_rate": 0.019844431778640086, + "loss": 3.2556, + "mean_token_accuracy": 0.3789980411529541, + "num_tokens": 827886544.0, + "step": 1619 + }, + { + "epoch": 0.43807463493780424, + "grad_norm": 4.0, + "learning_rate": 0.01984413979255256, + "loss": 3.4433, + "mean_token_accuracy": 0.3631853759288788, + "num_tokens": 828410775.0, + "step": 1620 + }, + { + "epoch": 0.43834505137912383, + "grad_norm": 2.71875, + "learning_rate": 0.019843847535100816, + "loss": 3.3869, + "mean_token_accuracy": 0.3555943965911865, + "num_tokens": 828934912.0, + "step": 1621 + }, + { + "epoch": 0.43861546782044347, + "grad_norm": 4.28125, + "learning_rate": 0.019843555006293822, + "loss": 3.3493, + "mean_token_accuracy": 0.3947744369506836, + "num_tokens": 829394947.0, + "step": 1622 + }, + { + "epoch": 0.4388858842617631, + "grad_norm": 3.078125, + "learning_rate": 0.019843262206140556, + "loss": 3.4506, + "mean_token_accuracy": 0.39684024453163147, + "num_tokens": 829854931.0, + "step": 1623 + }, + { + "epoch": 0.43915630070308276, + "grad_norm": 3.40625, + "learning_rate": 0.019842969134649995, + "loss": 3.6234, + "mean_token_accuracy": 0.36705708503723145, + "num_tokens": 830379078.0, + "step": 1624 + }, + { + "epoch": 0.4394267171444024, + "grad_norm": 3.625, + "learning_rate": 0.01984267579183114, + "loss": 3.6455, + "mean_token_accuracy": 0.3586854636669159, + "num_tokens": 830850478.0, + "step": 1625 + }, + { + "epoch": 0.439697133585722, + "grad_norm": 3.21875, + "learning_rate": 0.019842382177692984, + "loss": 3.3551, + "mean_token_accuracy": 0.39802074432373047, + "num_tokens": 831311099.0, + "step": 1626 + }, + { + "epoch": 0.43996755002704163, + "grad_norm": 4.0, + "learning_rate": 0.01984208829224454, + "loss": 3.6883, + "mean_token_accuracy": 0.34819939732551575, + "num_tokens": 831835256.0, + "step": 1627 + }, + { + "epoch": 0.4402379664683613, + "grad_norm": 3.171875, + "learning_rate": 0.019841794135494823, + "loss": 3.2718, + "mean_token_accuracy": 0.3779810965061188, + "num_tokens": 832321299.0, + "step": 1628 + }, + { + "epoch": 0.4405083829096809, + "grad_norm": 3.1875, + "learning_rate": 0.01984149970745286, + "loss": 3.5774, + "mean_token_accuracy": 0.34361448884010315, + "num_tokens": 832845503.0, + "step": 1629 + }, + { + "epoch": 0.44077879935100056, + "grad_norm": 3.609375, + "learning_rate": 0.019841205008127687, + "loss": 3.6828, + "mean_token_accuracy": 0.352791428565979, + "num_tokens": 833369590.0, + "step": 1630 + }, + { + "epoch": 0.44104921579232015, + "grad_norm": 628.0, + "learning_rate": 0.01984091003752834, + "loss": 20.5923, + "mean_token_accuracy": 0.040607377886772156, + "num_tokens": 833893791.0, + "step": 1631 + }, + { + "epoch": 0.4413196322336398, + "grad_norm": 7.53125, + "learning_rate": 0.019840614795663878, + "loss": 3.7563, + "mean_token_accuracy": 0.34289711713790894, + "num_tokens": 834418025.0, + "step": 1632 + }, + { + "epoch": 0.44159004867495943, + "grad_norm": 3.171875, + "learning_rate": 0.019840319282543352, + "loss": 3.5659, + "mean_token_accuracy": 0.34517043828964233, + "num_tokens": 834942185.0, + "step": 1633 + }, + { + "epoch": 0.4418604651162791, + "grad_norm": 3.359375, + "learning_rate": 0.019840023498175836, + "loss": 3.5613, + "mean_token_accuracy": 0.3461275100708008, + "num_tokens": 835466209.0, + "step": 1634 + }, + { + "epoch": 0.4421308815575987, + "grad_norm": 3.015625, + "learning_rate": 0.019839727442570402, + "loss": 3.5313, + "mean_token_accuracy": 0.3604823052883148, + "num_tokens": 835990422.0, + "step": 1635 + }, + { + "epoch": 0.4424012979989183, + "grad_norm": 2.890625, + "learning_rate": 0.01983943111573613, + "loss": 3.4164, + "mean_token_accuracy": 0.3613922595977783, + "num_tokens": 836514607.0, + "step": 1636 + }, + { + "epoch": 0.44267171444023795, + "grad_norm": 3.1875, + "learning_rate": 0.019839134517682124, + "loss": 3.4717, + "mean_token_accuracy": 0.36686933040618896, + "num_tokens": 837038843.0, + "step": 1637 + }, + { + "epoch": 0.4429421308815576, + "grad_norm": 2.96875, + "learning_rate": 0.01983883764841747, + "loss": 3.3497, + "mean_token_accuracy": 0.38434088230133057, + "num_tokens": 837563056.0, + "step": 1638 + }, + { + "epoch": 0.44321254732287724, + "grad_norm": 2.453125, + "learning_rate": 0.019838540507951283, + "loss": 3.3649, + "mean_token_accuracy": 0.38213208317756653, + "num_tokens": 838087139.0, + "step": 1639 + }, + { + "epoch": 0.4434829637641969, + "grad_norm": 3.359375, + "learning_rate": 0.019838243096292684, + "loss": 3.4143, + "mean_token_accuracy": 0.3468073904514313, + "num_tokens": 838611320.0, + "step": 1640 + }, + { + "epoch": 0.44375338020551647, + "grad_norm": 2.203125, + "learning_rate": 0.019837945413450794, + "loss": 3.3741, + "mean_token_accuracy": 0.41013461351394653, + "num_tokens": 839052909.0, + "step": 1641 + }, + { + "epoch": 0.4440237966468361, + "grad_norm": 3.34375, + "learning_rate": 0.01983764745943475, + "loss": 3.6862, + "mean_token_accuracy": 0.34594327211380005, + "num_tokens": 839577094.0, + "step": 1642 + }, + { + "epoch": 0.44429421308815575, + "grad_norm": 6.03125, + "learning_rate": 0.019837349234253693, + "loss": 3.1924, + "mean_token_accuracy": 0.39323198795318604, + "num_tokens": 840101349.0, + "step": 1643 + }, + { + "epoch": 0.4445646295294754, + "grad_norm": 2.21875, + "learning_rate": 0.019837050737916768, + "loss": 3.3771, + "mean_token_accuracy": 0.37082356214523315, + "num_tokens": 840599141.0, + "step": 1644 + }, + { + "epoch": 0.44483504597079504, + "grad_norm": 3.625, + "learning_rate": 0.01983675197043314, + "loss": 3.1797, + "mean_token_accuracy": 0.3696306347846985, + "num_tokens": 841123308.0, + "step": 1645 + }, + { + "epoch": 0.4451054624121147, + "grad_norm": 2.21875, + "learning_rate": 0.019836452931811976, + "loss": 3.4027, + "mean_token_accuracy": 0.38976573944091797, + "num_tokens": 841593267.0, + "step": 1646 + }, + { + "epoch": 0.44537587885343427, + "grad_norm": 2.78125, + "learning_rate": 0.01983615362206245, + "loss": 3.2916, + "mean_token_accuracy": 0.3565780520439148, + "num_tokens": 842117359.0, + "step": 1647 + }, + { + "epoch": 0.4456462952947539, + "grad_norm": 2.328125, + "learning_rate": 0.019835854041193743, + "loss": 3.3006, + "mean_token_accuracy": 0.38226160407066345, + "num_tokens": 842616464.0, + "step": 1648 + }, + { + "epoch": 0.44591671173607356, + "grad_norm": 2.875, + "learning_rate": 0.019835554189215048, + "loss": 3.3272, + "mean_token_accuracy": 0.36526086926460266, + "num_tokens": 843140523.0, + "step": 1649 + }, + { + "epoch": 0.4461871281773932, + "grad_norm": 2.359375, + "learning_rate": 0.019835254066135573, + "loss": 3.2505, + "mean_token_accuracy": 0.3782128095626831, + "num_tokens": 843664774.0, + "step": 1650 + }, + { + "epoch": 0.44645754461871284, + "grad_norm": 2.078125, + "learning_rate": 0.019834953671964514, + "loss": 11.2267, + "mean_token_accuracy": 5.488956048793625e-06, + "num_tokens": 844188976.0, + "step": 1651 + }, + { + "epoch": 0.44672796106003243, + "grad_norm": 8.6875, + "learning_rate": 0.0198346530067111, + "loss": 4.2633, + "mean_token_accuracy": 0.24534830451011658, + "num_tokens": 844713232.0, + "step": 1652 + }, + { + "epoch": 0.4469983775013521, + "grad_norm": 2.75, + "learning_rate": 0.01983435207038455, + "loss": 3.4383, + "mean_token_accuracy": 0.34829649329185486, + "num_tokens": 845211689.0, + "step": 1653 + }, + { + "epoch": 0.4472687939426717, + "grad_norm": 3.65625, + "learning_rate": 0.019834050862994092, + "loss": 3.5738, + "mean_token_accuracy": 0.356248140335083, + "num_tokens": 845735952.0, + "step": 1654 + }, + { + "epoch": 0.44753921038399136, + "grad_norm": 3.640625, + "learning_rate": 0.01983374938454898, + "loss": 3.4363, + "mean_token_accuracy": 0.3208945393562317, + "num_tokens": 846260101.0, + "step": 1655 + }, + { + "epoch": 0.447809626825311, + "grad_norm": 2.6875, + "learning_rate": 0.019833447635058458, + "loss": 3.6013, + "mean_token_accuracy": 0.36436372995376587, + "num_tokens": 846784315.0, + "step": 1656 + }, + { + "epoch": 0.4480800432666306, + "grad_norm": 3.75, + "learning_rate": 0.019833145614531783, + "loss": 3.4837, + "mean_token_accuracy": 0.3555912971496582, + "num_tokens": 847308511.0, + "step": 1657 + }, + { + "epoch": 0.44835045970795023, + "grad_norm": 3.28125, + "learning_rate": 0.019832843322978223, + "loss": 3.1707, + "mean_token_accuracy": 0.3805667459964752, + "num_tokens": 847832738.0, + "step": 1658 + }, + { + "epoch": 0.4486208761492699, + "grad_norm": 3.140625, + "learning_rate": 0.01983254076040706, + "loss": 3.4124, + "mean_token_accuracy": 0.35256272554397583, + "num_tokens": 848357003.0, + "step": 1659 + }, + { + "epoch": 0.4488912925905895, + "grad_norm": 3.34375, + "learning_rate": 0.01983223792682757, + "loss": 3.514, + "mean_token_accuracy": 0.37265545129776, + "num_tokens": 848881261.0, + "step": 1660 + }, + { + "epoch": 0.44916170903190916, + "grad_norm": 10.625, + "learning_rate": 0.019831934822249042, + "loss": 3.6388, + "mean_token_accuracy": 0.36395561695098877, + "num_tokens": 849389217.0, + "step": 1661 + }, + { + "epoch": 0.44943212547322875, + "grad_norm": 2.890625, + "learning_rate": 0.019831631446680785, + "loss": 3.5341, + "mean_token_accuracy": 0.3607974052429199, + "num_tokens": 849913416.0, + "step": 1662 + }, + { + "epoch": 0.4497025419145484, + "grad_norm": 5.21875, + "learning_rate": 0.0198313278001321, + "loss": 3.4023, + "mean_token_accuracy": 0.379169225692749, + "num_tokens": 850437686.0, + "step": 1663 + }, + { + "epoch": 0.44997295835586804, + "grad_norm": 3.421875, + "learning_rate": 0.01983102388261231, + "loss": 3.7204, + "mean_token_accuracy": 0.334351122379303, + "num_tokens": 850961860.0, + "step": 1664 + }, + { + "epoch": 0.4502433747971877, + "grad_norm": 4.09375, + "learning_rate": 0.019830719694130732, + "loss": 3.7368, + "mean_token_accuracy": 0.3455047607421875, + "num_tokens": 851424259.0, + "step": 1665 + }, + { + "epoch": 0.4505137912385073, + "grad_norm": 2.53125, + "learning_rate": 0.019830415234696707, + "loss": 3.0135, + "mean_token_accuracy": 0.38640260696411133, + "num_tokens": 851941252.0, + "step": 1666 + }, + { + "epoch": 0.4507842076798269, + "grad_norm": 3.140625, + "learning_rate": 0.019830110504319574, + "loss": 3.251, + "mean_token_accuracy": 0.37877464294433594, + "num_tokens": 852465482.0, + "step": 1667 + }, + { + "epoch": 0.45105462412114655, + "grad_norm": 3.0625, + "learning_rate": 0.019829805503008682, + "loss": 3.1446, + "mean_token_accuracy": 0.3731510639190674, + "num_tokens": 852989671.0, + "step": 1668 + }, + { + "epoch": 0.4513250405624662, + "grad_norm": 3.625, + "learning_rate": 0.019829500230773393, + "loss": 3.5807, + "mean_token_accuracy": 0.354780375957489, + "num_tokens": 853455020.0, + "step": 1669 + }, + { + "epoch": 0.45159545700378584, + "grad_norm": 2.546875, + "learning_rate": 0.01982919468762307, + "loss": 3.3604, + "mean_token_accuracy": 0.36785411834716797, + "num_tokens": 853979267.0, + "step": 1670 + }, + { + "epoch": 0.4518658734451055, + "grad_norm": 23.75, + "learning_rate": 0.01982888887356709, + "loss": 12.4877, + "mean_token_accuracy": 0.0, + "num_tokens": 854503471.0, + "step": 1671 + }, + { + "epoch": 0.45213628988642507, + "grad_norm": 7.71875, + "learning_rate": 0.019828582788614834, + "loss": 4.1642, + "mean_token_accuracy": 0.2477474957704544, + "num_tokens": 855027517.0, + "step": 1672 + }, + { + "epoch": 0.4524067063277447, + "grad_norm": 2.984375, + "learning_rate": 0.019828276432775693, + "loss": 3.7384, + "mean_token_accuracy": 0.3281060457229614, + "num_tokens": 855517601.0, + "step": 1673 + }, + { + "epoch": 0.45267712276906436, + "grad_norm": 2.828125, + "learning_rate": 0.019827969806059074, + "loss": 3.4768, + "mean_token_accuracy": 0.3592349588871002, + "num_tokens": 855986416.0, + "step": 1674 + }, + { + "epoch": 0.452947539210384, + "grad_norm": 3.328125, + "learning_rate": 0.019827662908474375, + "loss": 3.3432, + "mean_token_accuracy": 0.3724978566169739, + "num_tokens": 856510650.0, + "step": 1675 + }, + { + "epoch": 0.45321795565170364, + "grad_norm": 3.640625, + "learning_rate": 0.01982735574003102, + "loss": 3.4996, + "mean_token_accuracy": 0.3593195974826813, + "num_tokens": 857029768.0, + "step": 1676 + }, + { + "epoch": 0.45348837209302323, + "grad_norm": 2.59375, + "learning_rate": 0.01982704830073843, + "loss": 3.5171, + "mean_token_accuracy": 0.3606383204460144, + "num_tokens": 857553971.0, + "step": 1677 + }, + { + "epoch": 0.45375878853434287, + "grad_norm": 3.171875, + "learning_rate": 0.019826740590606042, + "loss": 3.3659, + "mean_token_accuracy": 0.368857204914093, + "num_tokens": 858078245.0, + "step": 1678 + }, + { + "epoch": 0.4540292049756625, + "grad_norm": 3.09375, + "learning_rate": 0.019826432609643292, + "loss": 3.4993, + "mean_token_accuracy": 0.3794770836830139, + "num_tokens": 858579976.0, + "step": 1679 + }, + { + "epoch": 0.45429962141698216, + "grad_norm": 4.03125, + "learning_rate": 0.019826124357859635, + "loss": 3.4805, + "mean_token_accuracy": 0.35857492685317993, + "num_tokens": 859104168.0, + "step": 1680 + }, + { + "epoch": 0.4545700378583018, + "grad_norm": 2.765625, + "learning_rate": 0.01982581583526452, + "loss": 3.2264, + "mean_token_accuracy": 0.3718610405921936, + "num_tokens": 859592696.0, + "step": 1681 + }, + { + "epoch": 0.45484045429962144, + "grad_norm": 3.75, + "learning_rate": 0.01982550704186743, + "loss": 3.6234, + "mean_token_accuracy": 0.3305903375148773, + "num_tokens": 860085234.0, + "step": 1682 + }, + { + "epoch": 0.45511087074094103, + "grad_norm": 2.703125, + "learning_rate": 0.019825197977677823, + "loss": 3.2999, + "mean_token_accuracy": 0.37177783250808716, + "num_tokens": 860609390.0, + "step": 1683 + }, + { + "epoch": 0.4553812871822607, + "grad_norm": 3.953125, + "learning_rate": 0.019824888642705193, + "loss": 3.5949, + "mean_token_accuracy": 0.3574983477592468, + "num_tokens": 861132478.0, + "step": 1684 + }, + { + "epoch": 0.4556517036235803, + "grad_norm": 3.875, + "learning_rate": 0.019824579036959025, + "loss": 3.7333, + "mean_token_accuracy": 0.36286449432373047, + "num_tokens": 861656611.0, + "step": 1685 + }, + { + "epoch": 0.45592212006489996, + "grad_norm": 3.296875, + "learning_rate": 0.01982426916044882, + "loss": 3.3988, + "mean_token_accuracy": 0.3481968641281128, + "num_tokens": 862180885.0, + "step": 1686 + }, + { + "epoch": 0.4561925365062196, + "grad_norm": 2.578125, + "learning_rate": 0.01982395901318409, + "loss": 3.523, + "mean_token_accuracy": 0.34986233711242676, + "num_tokens": 862704963.0, + "step": 1687 + }, + { + "epoch": 0.4564629529475392, + "grad_norm": 2.9375, + "learning_rate": 0.019823648595174343, + "loss": 3.3933, + "mean_token_accuracy": 0.35712936520576477, + "num_tokens": 863229081.0, + "step": 1688 + }, + { + "epoch": 0.45673336938885883, + "grad_norm": 3.25, + "learning_rate": 0.019823337906429112, + "loss": 3.7007, + "mean_token_accuracy": 0.3445402681827545, + "num_tokens": 863753352.0, + "step": 1689 + }, + { + "epoch": 0.4570037858301785, + "grad_norm": 2.90625, + "learning_rate": 0.019823026946957926, + "loss": 3.1983, + "mean_token_accuracy": 0.39377349615097046, + "num_tokens": 864197221.0, + "step": 1690 + }, + { + "epoch": 0.4572742022714981, + "grad_norm": 33.25, + "learning_rate": 0.01982271571677033, + "loss": 14.3377, + "mean_token_accuracy": 0.009596459567546844, + "num_tokens": 864721369.0, + "step": 1691 + }, + { + "epoch": 0.45754461871281776, + "grad_norm": 5.71875, + "learning_rate": 0.01982240421587586, + "loss": 4.0545, + "mean_token_accuracy": 0.30463021993637085, + "num_tokens": 865190014.0, + "step": 1692 + }, + { + "epoch": 0.45781503515413735, + "grad_norm": 2.296875, + "learning_rate": 0.01982209244428409, + "loss": 3.669, + "mean_token_accuracy": 0.34585869312286377, + "num_tokens": 865714197.0, + "step": 1693 + }, + { + "epoch": 0.458085451595457, + "grad_norm": 2.515625, + "learning_rate": 0.01982178040200458, + "loss": 3.2826, + "mean_token_accuracy": 0.3869476616382599, + "num_tokens": 866181237.0, + "step": 1694 + }, + { + "epoch": 0.45835586803677664, + "grad_norm": 3.234375, + "learning_rate": 0.019821468089046904, + "loss": 3.6827, + "mean_token_accuracy": 0.3577444553375244, + "num_tokens": 866705513.0, + "step": 1695 + }, + { + "epoch": 0.4586262844780963, + "grad_norm": 3.109375, + "learning_rate": 0.019821155505420643, + "loss": 3.3081, + "mean_token_accuracy": 0.37024110555648804, + "num_tokens": 867229708.0, + "step": 1696 + }, + { + "epoch": 0.4588967009194159, + "grad_norm": 2.859375, + "learning_rate": 0.019820842651135394, + "loss": 3.3013, + "mean_token_accuracy": 0.3839324414730072, + "num_tokens": 867753924.0, + "step": 1697 + }, + { + "epoch": 0.4591671173607355, + "grad_norm": 3.765625, + "learning_rate": 0.01982052952620075, + "loss": 3.6595, + "mean_token_accuracy": 0.33300721645355225, + "num_tokens": 868278050.0, + "step": 1698 + }, + { + "epoch": 0.45943753380205515, + "grad_norm": 3.265625, + "learning_rate": 0.019820216130626317, + "loss": 3.2572, + "mean_token_accuracy": 0.3843660056591034, + "num_tokens": 868802330.0, + "step": 1699 + }, + { + "epoch": 0.4597079502433748, + "grad_norm": 4.0, + "learning_rate": 0.019819902464421717, + "loss": 3.5953, + "mean_token_accuracy": 0.3714136779308319, + "num_tokens": 869270808.0, + "step": 1700 + }, + { + "epoch": 0.45997836668469444, + "grad_norm": 3.421875, + "learning_rate": 0.01981958852759657, + "loss": 3.6094, + "mean_token_accuracy": 0.34295108914375305, + "num_tokens": 869795019.0, + "step": 1701 + }, + { + "epoch": 0.4602487831260141, + "grad_norm": 2.53125, + "learning_rate": 0.01981927432016051, + "loss": 3.3864, + "mean_token_accuracy": 0.40298017859458923, + "num_tokens": 870259295.0, + "step": 1702 + }, + { + "epoch": 0.46051919956733367, + "grad_norm": 3.375, + "learning_rate": 0.019818959842123184, + "loss": 3.6626, + "mean_token_accuracy": 0.35923194885253906, + "num_tokens": 870744432.0, + "step": 1703 + }, + { + "epoch": 0.4607896160086533, + "grad_norm": 2.796875, + "learning_rate": 0.01981864509349423, + "loss": 3.4526, + "mean_token_accuracy": 0.3656061887741089, + "num_tokens": 871268697.0, + "step": 1704 + }, + { + "epoch": 0.46106003244997296, + "grad_norm": 2.875, + "learning_rate": 0.019818330074283314, + "loss": 3.5559, + "mean_token_accuracy": 0.37124043703079224, + "num_tokens": 871792840.0, + "step": 1705 + }, + { + "epoch": 0.4613304488912926, + "grad_norm": 2.671875, + "learning_rate": 0.019818014784500095, + "loss": 3.2906, + "mean_token_accuracy": 0.38858962059020996, + "num_tokens": 872304735.0, + "step": 1706 + }, + { + "epoch": 0.46160086533261224, + "grad_norm": 3.453125, + "learning_rate": 0.019817699224154252, + "loss": 3.3103, + "mean_token_accuracy": 0.3634117841720581, + "num_tokens": 872828997.0, + "step": 1707 + }, + { + "epoch": 0.46187128177393183, + "grad_norm": 2.9375, + "learning_rate": 0.019817383393255464, + "loss": 3.3499, + "mean_token_accuracy": 0.3761640191078186, + "num_tokens": 873353186.0, + "step": 1708 + }, + { + "epoch": 0.4621416982152515, + "grad_norm": 3.0, + "learning_rate": 0.019817067291813425, + "loss": 3.4281, + "mean_token_accuracy": 0.34893667697906494, + "num_tokens": 873877459.0, + "step": 1709 + }, + { + "epoch": 0.4624121146565711, + "grad_norm": 3.390625, + "learning_rate": 0.019816750919837835, + "loss": 3.5113, + "mean_token_accuracy": 0.3725959062576294, + "num_tokens": 874401646.0, + "step": 1710 + }, + { + "epoch": 0.46268253109789076, + "grad_norm": 0.51171875, + "learning_rate": 0.019816434277338393, + "loss": 11.0856, + "mean_token_accuracy": 9.99705935100792e-06, + "num_tokens": 874925918.0, + "step": 1711 + }, + { + "epoch": 0.4629529475392104, + "grad_norm": 8.3125, + "learning_rate": 0.01981611736432482, + "loss": 4.2965, + "mean_token_accuracy": 0.2650476396083832, + "num_tokens": 875394481.0, + "step": 1712 + }, + { + "epoch": 0.46322336398053, + "grad_norm": 3.28125, + "learning_rate": 0.019815800180806843, + "loss": 3.513, + "mean_token_accuracy": 0.3389799892902374, + "num_tokens": 875918726.0, + "step": 1713 + }, + { + "epoch": 0.46349378042184963, + "grad_norm": 2.859375, + "learning_rate": 0.01981548272679419, + "loss": 3.4552, + "mean_token_accuracy": 0.35726398229599, + "num_tokens": 876442823.0, + "step": 1714 + }, + { + "epoch": 0.4637641968631693, + "grad_norm": 5.03125, + "learning_rate": 0.019815165002296603, + "loss": 3.5795, + "mean_token_accuracy": 0.3534772992134094, + "num_tokens": 876913999.0, + "step": 1715 + }, + { + "epoch": 0.4640346133044889, + "grad_norm": 2.59375, + "learning_rate": 0.01981484700732383, + "loss": 3.5386, + "mean_token_accuracy": 0.3769341707229614, + "num_tokens": 877380345.0, + "step": 1716 + }, + { + "epoch": 0.46430502974580856, + "grad_norm": 4.125, + "learning_rate": 0.019814528741885624, + "loss": 3.8245, + "mean_token_accuracy": 0.32663634419441223, + "num_tokens": 877904622.0, + "step": 1717 + }, + { + "epoch": 0.4645754461871282, + "grad_norm": 2.59375, + "learning_rate": 0.019814210205991757, + "loss": 3.4391, + "mean_token_accuracy": 0.3669121265411377, + "num_tokens": 878428863.0, + "step": 1718 + }, + { + "epoch": 0.4648458626284478, + "grad_norm": 3.828125, + "learning_rate": 0.019813891399652, + "loss": 3.4362, + "mean_token_accuracy": 0.3677617907524109, + "num_tokens": 878920005.0, + "step": 1719 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 8.1875, + "learning_rate": 0.019813572322876134, + "loss": 3.4034, + "mean_token_accuracy": 0.35246893763542175, + "num_tokens": 879444057.0, + "step": 1720 + }, + { + "epoch": 0.4653866955110871, + "grad_norm": 2.25, + "learning_rate": 0.01981325297567395, + "loss": 3.5155, + "mean_token_accuracy": 0.336559534072876, + "num_tokens": 879968067.0, + "step": 1721 + }, + { + "epoch": 0.4656571119524067, + "grad_norm": 3.21875, + "learning_rate": 0.01981293335805525, + "loss": 3.4088, + "mean_token_accuracy": 0.37710052728652954, + "num_tokens": 880492341.0, + "step": 1722 + }, + { + "epoch": 0.46592752839372636, + "grad_norm": 3.359375, + "learning_rate": 0.019812613470029834, + "loss": 3.5158, + "mean_token_accuracy": 0.35190102458000183, + "num_tokens": 880989764.0, + "step": 1723 + }, + { + "epoch": 0.46619794483504595, + "grad_norm": 3.8125, + "learning_rate": 0.019812293311607523, + "loss": 3.2438, + "mean_token_accuracy": 0.36337602138519287, + "num_tokens": 881514002.0, + "step": 1724 + }, + { + "epoch": 0.4664683612763656, + "grad_norm": 2.9375, + "learning_rate": 0.019811972882798134, + "loss": 3.4319, + "mean_token_accuracy": 0.37600719928741455, + "num_tokens": 881980682.0, + "step": 1725 + }, + { + "epoch": 0.46673877771768524, + "grad_norm": 3.9375, + "learning_rate": 0.019811652183611507, + "loss": 3.5716, + "mean_token_accuracy": 0.3365209102630615, + "num_tokens": 882504787.0, + "step": 1726 + }, + { + "epoch": 0.4670091941590049, + "grad_norm": 3.046875, + "learning_rate": 0.019811331214057478, + "loss": 3.4476, + "mean_token_accuracy": 0.3720388114452362, + "num_tokens": 883028880.0, + "step": 1727 + }, + { + "epoch": 0.4672796106003245, + "grad_norm": 3.9375, + "learning_rate": 0.019811009974145893, + "loss": 3.7345, + "mean_token_accuracy": 0.3553517460823059, + "num_tokens": 883472462.0, + "step": 1728 + }, + { + "epoch": 0.4675500270416441, + "grad_norm": 3.765625, + "learning_rate": 0.01981068846388661, + "loss": 3.5249, + "mean_token_accuracy": 0.3644287586212158, + "num_tokens": 883996735.0, + "step": 1729 + }, + { + "epoch": 0.46782044348296375, + "grad_norm": 3.0625, + "learning_rate": 0.019810366683289497, + "loss": 3.1648, + "mean_token_accuracy": 0.3963382840156555, + "num_tokens": 884520828.0, + "step": 1730 + }, + { + "epoch": 0.4680908599242834, + "grad_norm": 4.125, + "learning_rate": 0.01981004463236442, + "loss": 10.1906, + "mean_token_accuracy": 0.0003295106580480933, + "num_tokens": 884993471.0, + "step": 1731 + }, + { + "epoch": 0.46836127636560304, + "grad_norm": 6.96875, + "learning_rate": 0.019809722311121266, + "loss": 4.3465, + "mean_token_accuracy": 0.2589428722858429, + "num_tokens": 885513624.0, + "step": 1732 + }, + { + "epoch": 0.4686316928069227, + "grad_norm": 2.703125, + "learning_rate": 0.019809399719569926, + "loss": 3.6631, + "mean_token_accuracy": 0.3460400402545929, + "num_tokens": 885983424.0, + "step": 1733 + }, + { + "epoch": 0.46890210924824227, + "grad_norm": 2.953125, + "learning_rate": 0.019809076857720296, + "loss": 3.5414, + "mean_token_accuracy": 0.3618663549423218, + "num_tokens": 886507706.0, + "step": 1734 + }, + { + "epoch": 0.4691725256895619, + "grad_norm": 3.3125, + "learning_rate": 0.01980875372558228, + "loss": 3.5805, + "mean_token_accuracy": 0.36715835332870483, + "num_tokens": 887001942.0, + "step": 1735 + }, + { + "epoch": 0.46944294213088156, + "grad_norm": 3.515625, + "learning_rate": 0.019808430323165795, + "loss": 3.5447, + "mean_token_accuracy": 0.35880857706069946, + "num_tokens": 887526085.0, + "step": 1736 + }, + { + "epoch": 0.4697133585722012, + "grad_norm": 2.65625, + "learning_rate": 0.019808106650480765, + "loss": 3.1906, + "mean_token_accuracy": 0.38787728548049927, + "num_tokens": 888004814.0, + "step": 1737 + }, + { + "epoch": 0.46998377501352084, + "grad_norm": 4.40625, + "learning_rate": 0.019807782707537122, + "loss": 3.3229, + "mean_token_accuracy": 0.36902278661727905, + "num_tokens": 888529071.0, + "step": 1738 + }, + { + "epoch": 0.47025419145484043, + "grad_norm": 2.75, + "learning_rate": 0.0198074584943448, + "loss": 3.5693, + "mean_token_accuracy": 0.3591044843196869, + "num_tokens": 889053269.0, + "step": 1739 + }, + { + "epoch": 0.4705246078961601, + "grad_norm": 3.03125, + "learning_rate": 0.019807134010913753, + "loss": 3.1832, + "mean_token_accuracy": 0.38186371326446533, + "num_tokens": 889577394.0, + "step": 1740 + }, + { + "epoch": 0.4707950243374797, + "grad_norm": 2.9375, + "learning_rate": 0.01980680925725393, + "loss": 3.5607, + "mean_token_accuracy": 0.328410267829895, + "num_tokens": 890101613.0, + "step": 1741 + }, + { + "epoch": 0.47106544077879936, + "grad_norm": 3.171875, + "learning_rate": 0.019806484233375304, + "loss": 3.3291, + "mean_token_accuracy": 0.3768472373485565, + "num_tokens": 890625787.0, + "step": 1742 + }, + { + "epoch": 0.471335857220119, + "grad_norm": 3.671875, + "learning_rate": 0.019806158939287843, + "loss": 3.646, + "mean_token_accuracy": 0.34793686866760254, + "num_tokens": 891138141.0, + "step": 1743 + }, + { + "epoch": 0.4716062736614386, + "grad_norm": 2.84375, + "learning_rate": 0.019805833375001528, + "loss": 3.4011, + "mean_token_accuracy": 0.3449268937110901, + "num_tokens": 891662412.0, + "step": 1744 + }, + { + "epoch": 0.47187669010275823, + "grad_norm": 3.875, + "learning_rate": 0.01980550754052635, + "loss": 3.3821, + "mean_token_accuracy": 0.3719683885574341, + "num_tokens": 892159381.0, + "step": 1745 + }, + { + "epoch": 0.4721471065440779, + "grad_norm": 3.28125, + "learning_rate": 0.019805181435872305, + "loss": 3.2968, + "mean_token_accuracy": 0.40677595138549805, + "num_tokens": 892646855.0, + "step": 1746 + }, + { + "epoch": 0.4724175229853975, + "grad_norm": 3.859375, + "learning_rate": 0.019804855061049394, + "loss": 3.6886, + "mean_token_accuracy": 0.3679199814796448, + "num_tokens": 893171064.0, + "step": 1747 + }, + { + "epoch": 0.47268793942671716, + "grad_norm": 3.828125, + "learning_rate": 0.019804528416067647, + "loss": 3.5756, + "mean_token_accuracy": 0.3563568592071533, + "num_tokens": 893625723.0, + "step": 1748 + }, + { + "epoch": 0.47295835586803675, + "grad_norm": 3.390625, + "learning_rate": 0.019804201500937067, + "loss": 3.4806, + "mean_token_accuracy": 0.3801092803478241, + "num_tokens": 894146303.0, + "step": 1749 + }, + { + "epoch": 0.4732287723093564, + "grad_norm": 3.375, + "learning_rate": 0.019803874315667696, + "loss": 3.4982, + "mean_token_accuracy": 0.3544697165489197, + "num_tokens": 894608531.0, + "step": 1750 + }, + { + "epoch": 0.47349918875067604, + "grad_norm": 30.375, + "learning_rate": 0.01980354686026957, + "loss": 11.2158, + "mean_token_accuracy": 0.001940581132657826, + "num_tokens": 895132787.0, + "step": 1751 + }, + { + "epoch": 0.4737696051919957, + "grad_norm": 10.1875, + "learning_rate": 0.019803219134752737, + "loss": 3.9685, + "mean_token_accuracy": 0.3015158772468567, + "num_tokens": 895656923.0, + "step": 1752 + }, + { + "epoch": 0.4740400216333153, + "grad_norm": 3.0625, + "learning_rate": 0.01980289113912725, + "loss": 3.7796, + "mean_token_accuracy": 0.3419232964515686, + "num_tokens": 896181085.0, + "step": 1753 + }, + { + "epoch": 0.4743104380746349, + "grad_norm": 2.765625, + "learning_rate": 0.01980256287340318, + "loss": 3.5251, + "mean_token_accuracy": 0.3594159483909607, + "num_tokens": 896705320.0, + "step": 1754 + }, + { + "epoch": 0.47458085451595455, + "grad_norm": 3.453125, + "learning_rate": 0.019802234337590593, + "loss": 3.4355, + "mean_token_accuracy": 0.3690674901008606, + "num_tokens": 897229558.0, + "step": 1755 + }, + { + "epoch": 0.4748512709572742, + "grad_norm": 1.9765625, + "learning_rate": 0.01980190553169957, + "loss": 3.6167, + "mean_token_accuracy": 0.3646950423717499, + "num_tokens": 897730732.0, + "step": 1756 + }, + { + "epoch": 0.47512168739859384, + "grad_norm": 2.78125, + "learning_rate": 0.019801576455740204, + "loss": 3.5623, + "mean_token_accuracy": 0.3523849844932556, + "num_tokens": 898254996.0, + "step": 1757 + }, + { + "epoch": 0.4753921038399135, + "grad_norm": 2.28125, + "learning_rate": 0.019801247109722583, + "loss": 3.4817, + "mean_token_accuracy": 0.3680952191352844, + "num_tokens": 898779202.0, + "step": 1758 + }, + { + "epoch": 0.4756625202812331, + "grad_norm": 3.4375, + "learning_rate": 0.01980091749365682, + "loss": 3.2632, + "mean_token_accuracy": 0.3792809247970581, + "num_tokens": 899303313.0, + "step": 1759 + }, + { + "epoch": 0.4759329367225527, + "grad_norm": 3.921875, + "learning_rate": 0.019800587607553027, + "loss": 3.5518, + "mean_token_accuracy": 0.36530423164367676, + "num_tokens": 899827535.0, + "step": 1760 + }, + { + "epoch": 0.47620335316387236, + "grad_norm": 2.984375, + "learning_rate": 0.019800257451421326, + "loss": 3.4459, + "mean_token_accuracy": 0.3665323257446289, + "num_tokens": 900351767.0, + "step": 1761 + }, + { + "epoch": 0.476473769605192, + "grad_norm": 3.0, + "learning_rate": 0.019799927025271846, + "loss": 3.6227, + "mean_token_accuracy": 0.3784589171409607, + "num_tokens": 900782640.0, + "step": 1762 + }, + { + "epoch": 0.47674418604651164, + "grad_norm": 2.40625, + "learning_rate": 0.01979959632911472, + "loss": 3.233, + "mean_token_accuracy": 0.3802822530269623, + "num_tokens": 901306826.0, + "step": 1763 + }, + { + "epoch": 0.4770146024878313, + "grad_norm": 2.59375, + "learning_rate": 0.01979926536296011, + "loss": 3.4967, + "mean_token_accuracy": 0.3660164177417755, + "num_tokens": 901831069.0, + "step": 1764 + }, + { + "epoch": 0.4772850189291509, + "grad_norm": 3.9375, + "learning_rate": 0.019798934126818157, + "loss": 3.7035, + "mean_token_accuracy": 0.35335782170295715, + "num_tokens": 902317494.0, + "step": 1765 + }, + { + "epoch": 0.4775554353704705, + "grad_norm": 2.625, + "learning_rate": 0.01979860262069903, + "loss": 3.3601, + "mean_token_accuracy": 0.373335063457489, + "num_tokens": 902818017.0, + "step": 1766 + }, + { + "epoch": 0.47782585181179016, + "grad_norm": 3.640625, + "learning_rate": 0.0197982708446129, + "loss": 3.3768, + "mean_token_accuracy": 0.3863041400909424, + "num_tokens": 903342174.0, + "step": 1767 + }, + { + "epoch": 0.4780962682531098, + "grad_norm": 3.25, + "learning_rate": 0.019797938798569944, + "loss": 3.4276, + "mean_token_accuracy": 0.35850319266319275, + "num_tokens": 903866400.0, + "step": 1768 + }, + { + "epoch": 0.47836668469442944, + "grad_norm": 2.609375, + "learning_rate": 0.019797606482580356, + "loss": 3.3711, + "mean_token_accuracy": 0.38029783964157104, + "num_tokens": 904390608.0, + "step": 1769 + }, + { + "epoch": 0.47863710113574903, + "grad_norm": 2.484375, + "learning_rate": 0.019797273896654328, + "loss": 3.1741, + "mean_token_accuracy": 0.3852249085903168, + "num_tokens": 904898025.0, + "step": 1770 + }, + { + "epoch": 0.4789075175770687, + "grad_norm": 3.71875, + "learning_rate": 0.019796941040802065, + "loss": 10.7376, + "mean_token_accuracy": 0.0, + "num_tokens": 905422191.0, + "step": 1771 + }, + { + "epoch": 0.4791779340183883, + "grad_norm": 7.78125, + "learning_rate": 0.019796607915033782, + "loss": 4.1952, + "mean_token_accuracy": 0.25412917137145996, + "num_tokens": 905913738.0, + "step": 1772 + }, + { + "epoch": 0.47944835045970796, + "grad_norm": 2.75, + "learning_rate": 0.019796274519359697, + "loss": 3.585, + "mean_token_accuracy": 0.3126397132873535, + "num_tokens": 906437878.0, + "step": 1773 + }, + { + "epoch": 0.4797187669010276, + "grad_norm": 2.53125, + "learning_rate": 0.019795940853790044, + "loss": 3.4849, + "mean_token_accuracy": 0.36392486095428467, + "num_tokens": 906961995.0, + "step": 1774 + }, + { + "epoch": 0.4799891833423472, + "grad_norm": 2.828125, + "learning_rate": 0.019795606918335058, + "loss": 3.4098, + "mean_token_accuracy": 0.35264700651168823, + "num_tokens": 907486100.0, + "step": 1775 + }, + { + "epoch": 0.48025959978366684, + "grad_norm": 2.96875, + "learning_rate": 0.019795272713004984, + "loss": 3.7221, + "mean_token_accuracy": 0.32638639211654663, + "num_tokens": 908010361.0, + "step": 1776 + }, + { + "epoch": 0.4805300162249865, + "grad_norm": 3.859375, + "learning_rate": 0.01979493823781008, + "loss": 3.7043, + "mean_token_accuracy": 0.3388391137123108, + "num_tokens": 908534516.0, + "step": 1777 + }, + { + "epoch": 0.4808004326663061, + "grad_norm": 2.78125, + "learning_rate": 0.019794603492760604, + "loss": 3.3339, + "mean_token_accuracy": 0.3726591467857361, + "num_tokens": 909058713.0, + "step": 1778 + }, + { + "epoch": 0.48107084910762576, + "grad_norm": 4.78125, + "learning_rate": 0.01979426847786683, + "loss": 3.5233, + "mean_token_accuracy": 0.32987040281295776, + "num_tokens": 909582969.0, + "step": 1779 + }, + { + "epoch": 0.48134126554894535, + "grad_norm": 3.28125, + "learning_rate": 0.01979393319313904, + "loss": 3.3655, + "mean_token_accuracy": 0.3497632145881653, + "num_tokens": 910086296.0, + "step": 1780 + }, + { + "epoch": 0.481611681990265, + "grad_norm": 3.0625, + "learning_rate": 0.019793597638587512, + "loss": 3.6299, + "mean_token_accuracy": 0.364399790763855, + "num_tokens": 910571969.0, + "step": 1781 + }, + { + "epoch": 0.48188209843158464, + "grad_norm": 3.953125, + "learning_rate": 0.019793261814222553, + "loss": 3.6002, + "mean_token_accuracy": 0.3369339108467102, + "num_tokens": 911096229.0, + "step": 1782 + }, + { + "epoch": 0.4821525148729043, + "grad_norm": 2.484375, + "learning_rate": 0.01979292572005446, + "loss": 3.2475, + "mean_token_accuracy": 0.40165454149246216, + "num_tokens": 911561471.0, + "step": 1783 + }, + { + "epoch": 0.4824229313142239, + "grad_norm": 2.578125, + "learning_rate": 0.019792589356093553, + "loss": 3.5262, + "mean_token_accuracy": 0.36102116107940674, + "num_tokens": 912041823.0, + "step": 1784 + }, + { + "epoch": 0.4826933477555435, + "grad_norm": 2.6875, + "learning_rate": 0.01979225272235014, + "loss": 3.1892, + "mean_token_accuracy": 0.393157422542572, + "num_tokens": 912565950.0, + "step": 1785 + }, + { + "epoch": 0.48296376419686315, + "grad_norm": 2.734375, + "learning_rate": 0.019791915818834563, + "loss": 3.397, + "mean_token_accuracy": 0.3768187463283539, + "num_tokens": 913051358.0, + "step": 1786 + }, + { + "epoch": 0.4832341806381828, + "grad_norm": 2.578125, + "learning_rate": 0.01979157864555715, + "loss": 3.3381, + "mean_token_accuracy": 0.3847537934780121, + "num_tokens": 913554945.0, + "step": 1787 + }, + { + "epoch": 0.48350459707950244, + "grad_norm": 2.234375, + "learning_rate": 0.01979124120252825, + "loss": 3.2372, + "mean_token_accuracy": 0.3819988965988159, + "num_tokens": 914079116.0, + "step": 1788 + }, + { + "epoch": 0.4837750135208221, + "grad_norm": 3.546875, + "learning_rate": 0.01979090348975822, + "loss": 3.5402, + "mean_token_accuracy": 0.3616035580635071, + "num_tokens": 914603380.0, + "step": 1789 + }, + { + "epoch": 0.48404542996214167, + "grad_norm": 3.09375, + "learning_rate": 0.019790565507257413, + "loss": 3.5445, + "mean_token_accuracy": 0.3538735806941986, + "num_tokens": 915127630.0, + "step": 1790 + }, + { + "epoch": 0.4843158464034613, + "grad_norm": 2.5625, + "learning_rate": 0.019790227255036208, + "loss": 9.5983, + "mean_token_accuracy": 0.0064101070165634155, + "num_tokens": 915651848.0, + "step": 1791 + }, + { + "epoch": 0.48458626284478096, + "grad_norm": 10.625, + "learning_rate": 0.019789888733104978, + "loss": 4.0968, + "mean_token_accuracy": 0.28619569540023804, + "num_tokens": 916121677.0, + "step": 1792 + }, + { + "epoch": 0.4848566792861006, + "grad_norm": 2.8125, + "learning_rate": 0.019789549941474117, + "loss": 3.6557, + "mean_token_accuracy": 0.33985936641693115, + "num_tokens": 916645852.0, + "step": 1793 + }, + { + "epoch": 0.48512709572742024, + "grad_norm": 4.75, + "learning_rate": 0.01978921088015401, + "loss": 3.4577, + "mean_token_accuracy": 0.35815852880477905, + "num_tokens": 917170092.0, + "step": 1794 + }, + { + "epoch": 0.4853975121687399, + "grad_norm": 8.6875, + "learning_rate": 0.019788871549155068, + "loss": 3.3845, + "mean_token_accuracy": 0.36967670917510986, + "num_tokens": 917694152.0, + "step": 1795 + }, + { + "epoch": 0.4856679286100595, + "grad_norm": 2.3125, + "learning_rate": 0.019788531948487703, + "loss": 3.5148, + "mean_token_accuracy": 0.3659049868583679, + "num_tokens": 918218354.0, + "step": 1796 + }, + { + "epoch": 0.4859383450513791, + "grad_norm": 3.5625, + "learning_rate": 0.01978819207816233, + "loss": 3.4918, + "mean_token_accuracy": 0.35022732615470886, + "num_tokens": 918728961.0, + "step": 1797 + }, + { + "epoch": 0.48620876149269876, + "grad_norm": 3.109375, + "learning_rate": 0.019787851938189382, + "loss": 3.0971, + "mean_token_accuracy": 0.4142459034919739, + "num_tokens": 919253140.0, + "step": 1798 + }, + { + "epoch": 0.4864791779340184, + "grad_norm": 3.3125, + "learning_rate": 0.01978751152857929, + "loss": 3.5212, + "mean_token_accuracy": 0.34095361828804016, + "num_tokens": 919777342.0, + "step": 1799 + }, + { + "epoch": 0.48674959437533805, + "grad_norm": 3.140625, + "learning_rate": 0.019787170849342502, + "loss": 3.5343, + "mean_token_accuracy": 0.37006622552871704, + "num_tokens": 920301520.0, + "step": 1800 + }, + { + "epoch": 0.48702001081665763, + "grad_norm": 3.109375, + "learning_rate": 0.019786829900489477, + "loss": 3.8137, + "mean_token_accuracy": 0.3478415906429291, + "num_tokens": 920825804.0, + "step": 1801 + }, + { + "epoch": 0.4872904272579773, + "grad_norm": 2.96875, + "learning_rate": 0.019786488682030662, + "loss": 3.5569, + "mean_token_accuracy": 0.3461820185184479, + "num_tokens": 921350041.0, + "step": 1802 + }, + { + "epoch": 0.4875608436992969, + "grad_norm": 3.421875, + "learning_rate": 0.019786147193976537, + "loss": 3.543, + "mean_token_accuracy": 0.37615007162094116, + "num_tokens": 921819708.0, + "step": 1803 + }, + { + "epoch": 0.48783126014061656, + "grad_norm": 3.046875, + "learning_rate": 0.01978580543633758, + "loss": 3.3468, + "mean_token_accuracy": 0.36727088689804077, + "num_tokens": 922339084.0, + "step": 1804 + }, + { + "epoch": 0.4881016765819362, + "grad_norm": 3.5, + "learning_rate": 0.019785463409124272, + "loss": 3.5897, + "mean_token_accuracy": 0.3681144714355469, + "num_tokens": 922863340.0, + "step": 1805 + }, + { + "epoch": 0.4883720930232558, + "grad_norm": 3.5625, + "learning_rate": 0.019785121112347114, + "loss": 3.4887, + "mean_token_accuracy": 0.33610209822654724, + "num_tokens": 923387438.0, + "step": 1806 + }, + { + "epoch": 0.48864250946457544, + "grad_norm": 2.484375, + "learning_rate": 0.019784778546016603, + "loss": 3.1915, + "mean_token_accuracy": 0.39574626088142395, + "num_tokens": 923911425.0, + "step": 1807 + }, + { + "epoch": 0.4889129259058951, + "grad_norm": 4.375, + "learning_rate": 0.019784435710143253, + "loss": 3.3206, + "mean_token_accuracy": 0.4027346670627594, + "num_tokens": 924435326.0, + "step": 1808 + }, + { + "epoch": 0.4891833423472147, + "grad_norm": 2.9375, + "learning_rate": 0.01978409260473758, + "loss": 3.3278, + "mean_token_accuracy": 0.3857288360595703, + "num_tokens": 924951697.0, + "step": 1809 + }, + { + "epoch": 0.48945375878853437, + "grad_norm": 5.25, + "learning_rate": 0.019783749229810115, + "loss": 3.7132, + "mean_token_accuracy": 0.35733354091644287, + "num_tokens": 925475954.0, + "step": 1810 + }, + { + "epoch": 0.48972417522985395, + "grad_norm": 18.75, + "learning_rate": 0.01978340558537139, + "loss": 13.8032, + "mean_token_accuracy": 0.017868783324956894, + "num_tokens": 925951225.0, + "step": 1811 + }, + { + "epoch": 0.4899945916711736, + "grad_norm": 7.15625, + "learning_rate": 0.019783061671431953, + "loss": 4.2742, + "mean_token_accuracy": 0.29451698064804077, + "num_tokens": 926423681.0, + "step": 1812 + }, + { + "epoch": 0.49026500811249324, + "grad_norm": 2.546875, + "learning_rate": 0.019782717488002356, + "loss": 3.7415, + "mean_token_accuracy": 0.3437204658985138, + "num_tokens": 926947909.0, + "step": 1813 + }, + { + "epoch": 0.4905354245538129, + "grad_norm": 2.90625, + "learning_rate": 0.019782373035093156, + "loss": 3.5243, + "mean_token_accuracy": 0.36827898025512695, + "num_tokens": 927472098.0, + "step": 1814 + }, + { + "epoch": 0.4908058409951325, + "grad_norm": 2.96875, + "learning_rate": 0.019782028312714927, + "loss": 3.6337, + "mean_token_accuracy": 0.3471980392932892, + "num_tokens": 927973232.0, + "step": 1815 + }, + { + "epoch": 0.4910762574364521, + "grad_norm": 2.484375, + "learning_rate": 0.019781683320878242, + "loss": 3.4764, + "mean_token_accuracy": 0.37496355175971985, + "num_tokens": 928497383.0, + "step": 1816 + }, + { + "epoch": 0.49134667387777176, + "grad_norm": 3.359375, + "learning_rate": 0.019781338059593683, + "loss": 3.3855, + "mean_token_accuracy": 0.35589465498924255, + "num_tokens": 929021635.0, + "step": 1817 + }, + { + "epoch": 0.4916170903190914, + "grad_norm": 2.734375, + "learning_rate": 0.019780992528871853, + "loss": 3.4579, + "mean_token_accuracy": 0.37559372186660767, + "num_tokens": 929545815.0, + "step": 1818 + }, + { + "epoch": 0.49188750676041104, + "grad_norm": 3.40625, + "learning_rate": 0.019780646728723348, + "loss": 3.3428, + "mean_token_accuracy": 0.4049016833305359, + "num_tokens": 930069884.0, + "step": 1819 + }, + { + "epoch": 0.4921579232017307, + "grad_norm": 3.0, + "learning_rate": 0.01978030065915878, + "loss": 3.4011, + "mean_token_accuracy": 0.3871833384037018, + "num_tokens": 930593967.0, + "step": 1820 + }, + { + "epoch": 0.4924283396430503, + "grad_norm": 2.9375, + "learning_rate": 0.019779954320188763, + "loss": 3.4494, + "mean_token_accuracy": 0.38509970903396606, + "num_tokens": 931055281.0, + "step": 1821 + }, + { + "epoch": 0.4926987560843699, + "grad_norm": 3.765625, + "learning_rate": 0.01977960771182393, + "loss": 3.5927, + "mean_token_accuracy": 0.37115806341171265, + "num_tokens": 931579527.0, + "step": 1822 + }, + { + "epoch": 0.49296917252568956, + "grad_norm": 3.40625, + "learning_rate": 0.019779260834074913, + "loss": 3.4996, + "mean_token_accuracy": 0.3573734760284424, + "num_tokens": 932077762.0, + "step": 1823 + }, + { + "epoch": 0.4932395889670092, + "grad_norm": 2.734375, + "learning_rate": 0.019778913686952355, + "loss": 3.4129, + "mean_token_accuracy": 0.3876643180847168, + "num_tokens": 932543726.0, + "step": 1824 + }, + { + "epoch": 0.49351000540832884, + "grad_norm": 2.875, + "learning_rate": 0.01977856627046691, + "loss": 3.4499, + "mean_token_accuracy": 0.366586834192276, + "num_tokens": 933067985.0, + "step": 1825 + }, + { + "epoch": 0.49378042184964843, + "grad_norm": 3.71875, + "learning_rate": 0.019778218584629234, + "loss": 3.3473, + "mean_token_accuracy": 0.3576650023460388, + "num_tokens": 933592169.0, + "step": 1826 + }, + { + "epoch": 0.4940508382909681, + "grad_norm": 2.984375, + "learning_rate": 0.019777870629449995, + "loss": 3.5257, + "mean_token_accuracy": 0.36204415559768677, + "num_tokens": 934116345.0, + "step": 1827 + }, + { + "epoch": 0.4943212547322877, + "grad_norm": 3.484375, + "learning_rate": 0.019777522404939876, + "loss": 3.3575, + "mean_token_accuracy": 0.3585737347602844, + "num_tokens": 934640623.0, + "step": 1828 + }, + { + "epoch": 0.49459167117360736, + "grad_norm": 2.8125, + "learning_rate": 0.01977717391110955, + "loss": 3.3507, + "mean_token_accuracy": 0.30667975544929504, + "num_tokens": 935164792.0, + "step": 1829 + }, + { + "epoch": 0.494862087614927, + "grad_norm": 2.84375, + "learning_rate": 0.019776825147969718, + "loss": 3.2778, + "mean_token_accuracy": 0.3890639543533325, + "num_tokens": 935689061.0, + "step": 1830 + }, + { + "epoch": 0.49513250405624665, + "grad_norm": 148.0, + "learning_rate": 0.019776476115531082, + "loss": 17.4587, + "mean_token_accuracy": 1.785375934559852e-05, + "num_tokens": 936150897.0, + "step": 1831 + }, + { + "epoch": 0.49540292049756623, + "grad_norm": 8.6875, + "learning_rate": 0.01977612681380435, + "loss": 4.2326, + "mean_token_accuracy": 0.28762632608413696, + "num_tokens": 936675073.0, + "step": 1832 + }, + { + "epoch": 0.4956733369388859, + "grad_norm": 2.46875, + "learning_rate": 0.019775777242800236, + "loss": 3.5346, + "mean_token_accuracy": 0.3559064269065857, + "num_tokens": 937199284.0, + "step": 1833 + }, + { + "epoch": 0.4959437533802055, + "grad_norm": 2.921875, + "learning_rate": 0.01977542740252947, + "loss": 3.493, + "mean_token_accuracy": 0.35777419805526733, + "num_tokens": 937723453.0, + "step": 1834 + }, + { + "epoch": 0.49621416982152516, + "grad_norm": 2.859375, + "learning_rate": 0.019775077293002784, + "loss": 3.4845, + "mean_token_accuracy": 0.3783171772956848, + "num_tokens": 938247649.0, + "step": 1835 + }, + { + "epoch": 0.4964845862628448, + "grad_norm": 3.65625, + "learning_rate": 0.01977472691423092, + "loss": 3.5265, + "mean_token_accuracy": 0.35681647062301636, + "num_tokens": 938771916.0, + "step": 1836 + }, + { + "epoch": 0.4967550027041644, + "grad_norm": 2.609375, + "learning_rate": 0.01977437626622463, + "loss": 3.4236, + "mean_token_accuracy": 0.3689098060131073, + "num_tokens": 939296197.0, + "step": 1837 + }, + { + "epoch": 0.49702541914548404, + "grad_norm": 3.140625, + "learning_rate": 0.01977402534899467, + "loss": 3.6047, + "mean_token_accuracy": 0.36208608746528625, + "num_tokens": 939792392.0, + "step": 1838 + }, + { + "epoch": 0.4972958355868037, + "grad_norm": 2.78125, + "learning_rate": 0.019773674162551814, + "loss": 3.3299, + "mean_token_accuracy": 0.3716050982475281, + "num_tokens": 940316592.0, + "step": 1839 + }, + { + "epoch": 0.4975662520281233, + "grad_norm": 3.34375, + "learning_rate": 0.01977332270690683, + "loss": 3.5066, + "mean_token_accuracy": 0.3905620574951172, + "num_tokens": 940829445.0, + "step": 1840 + }, + { + "epoch": 0.49783666846944297, + "grad_norm": 2.84375, + "learning_rate": 0.019772970982070506, + "loss": 3.271, + "mean_token_accuracy": 0.38083672523498535, + "num_tokens": 941294035.0, + "step": 1841 + }, + { + "epoch": 0.49810708491076255, + "grad_norm": 2.359375, + "learning_rate": 0.01977261898805363, + "loss": 3.4404, + "mean_token_accuracy": 0.35014253854751587, + "num_tokens": 941817731.0, + "step": 1842 + }, + { + "epoch": 0.4983775013520822, + "grad_norm": 2.546875, + "learning_rate": 0.01977226672486701, + "loss": 3.3623, + "mean_token_accuracy": 0.37028980255126953, + "num_tokens": 942325871.0, + "step": 1843 + }, + { + "epoch": 0.49864791779340184, + "grad_norm": 3.6875, + "learning_rate": 0.019771914192521445, + "loss": 3.2385, + "mean_token_accuracy": 0.3884356915950775, + "num_tokens": 942813451.0, + "step": 1844 + }, + { + "epoch": 0.4989183342347215, + "grad_norm": 4.34375, + "learning_rate": 0.019771561391027755, + "loss": 3.2509, + "mean_token_accuracy": 0.39577963948249817, + "num_tokens": 943337727.0, + "step": 1845 + }, + { + "epoch": 0.4991887506760411, + "grad_norm": 2.359375, + "learning_rate": 0.019771208320396774, + "loss": 3.4, + "mean_token_accuracy": 0.3507879972457886, + "num_tokens": 943861829.0, + "step": 1846 + }, + { + "epoch": 0.4994591671173607, + "grad_norm": 2.9375, + "learning_rate": 0.01977085498063932, + "loss": 3.397, + "mean_token_accuracy": 0.34377992153167725, + "num_tokens": 944386110.0, + "step": 1847 + }, + { + "epoch": 0.49972958355868036, + "grad_norm": 2.484375, + "learning_rate": 0.019770501371766242, + "loss": 3.502, + "mean_token_accuracy": 0.3860476016998291, + "num_tokens": 944884648.0, + "step": 1848 + }, + { + "epoch": 0.5, + "grad_norm": 3.546875, + "learning_rate": 0.01977014749378839, + "loss": 3.5962, + "mean_token_accuracy": 0.3659733235836029, + "num_tokens": 945373079.0, + "step": 1849 + }, + { + "epoch": 0.5002704164413196, + "grad_norm": 2.609375, + "learning_rate": 0.01976979334671662, + "loss": 3.116, + "mean_token_accuracy": 0.3700687289237976, + "num_tokens": 945897166.0, + "step": 1850 + }, + { + "epoch": 0.5005408328826393, + "grad_norm": 74.0, + "learning_rate": 0.01976943893056181, + "loss": 11.4261, + "mean_token_accuracy": 0.005038955248892307, + "num_tokens": 946421439.0, + "step": 1851 + }, + { + "epoch": 0.5008112493239589, + "grad_norm": 8.8125, + "learning_rate": 0.019769084245334814, + "loss": 4.2191, + "mean_token_accuracy": 0.2826707065105438, + "num_tokens": 946921073.0, + "step": 1852 + }, + { + "epoch": 0.5010816657652786, + "grad_norm": 2.578125, + "learning_rate": 0.019768729291046526, + "loss": 3.4746, + "mean_token_accuracy": 0.35247424244880676, + "num_tokens": 947445350.0, + "step": 1853 + }, + { + "epoch": 0.5013520822065982, + "grad_norm": 2.984375, + "learning_rate": 0.019768374067707838, + "loss": 3.5582, + "mean_token_accuracy": 0.3633090853691101, + "num_tokens": 947969549.0, + "step": 1854 + }, + { + "epoch": 0.5016224986479177, + "grad_norm": 3.390625, + "learning_rate": 0.019768018575329647, + "loss": 3.4958, + "mean_token_accuracy": 0.3461088538169861, + "num_tokens": 948493723.0, + "step": 1855 + }, + { + "epoch": 0.5018929150892374, + "grad_norm": 3.4375, + "learning_rate": 0.019767662813922865, + "loss": 3.7091, + "mean_token_accuracy": 0.35045531392097473, + "num_tokens": 949017999.0, + "step": 1856 + }, + { + "epoch": 0.502163331530557, + "grad_norm": 3.140625, + "learning_rate": 0.019767306783498396, + "loss": 3.5131, + "mean_token_accuracy": 0.33321166038513184, + "num_tokens": 949542239.0, + "step": 1857 + }, + { + "epoch": 0.5024337479718767, + "grad_norm": 2.421875, + "learning_rate": 0.019766950484067177, + "loss": 3.5956, + "mean_token_accuracy": 0.3674095571041107, + "num_tokens": 950066385.0, + "step": 1858 + }, + { + "epoch": 0.5027041644131963, + "grad_norm": 2.9375, + "learning_rate": 0.019766593915640136, + "loss": 3.5765, + "mean_token_accuracy": 0.31543299555778503, + "num_tokens": 950590462.0, + "step": 1859 + }, + { + "epoch": 0.502974580854516, + "grad_norm": 2.953125, + "learning_rate": 0.01976623707822821, + "loss": 3.5112, + "mean_token_accuracy": 0.356577605009079, + "num_tokens": 951064320.0, + "step": 1860 + }, + { + "epoch": 0.5032449972958356, + "grad_norm": 3.71875, + "learning_rate": 0.019765879971842353, + "loss": 3.5876, + "mean_token_accuracy": 0.349673330783844, + "num_tokens": 951588587.0, + "step": 1861 + }, + { + "epoch": 0.5035154137371552, + "grad_norm": 2.828125, + "learning_rate": 0.01976552259649352, + "loss": 3.4716, + "mean_token_accuracy": 0.3703804910182953, + "num_tokens": 952112823.0, + "step": 1862 + }, + { + "epoch": 0.5037858301784749, + "grad_norm": 3.109375, + "learning_rate": 0.019765164952192674, + "loss": 3.5573, + "mean_token_accuracy": 0.35987555980682373, + "num_tokens": 952636935.0, + "step": 1863 + }, + { + "epoch": 0.5040562466197945, + "grad_norm": 2.625, + "learning_rate": 0.019764807038950793, + "loss": 3.2807, + "mean_token_accuracy": 0.3811916708946228, + "num_tokens": 953161104.0, + "step": 1864 + }, + { + "epoch": 0.5043266630611141, + "grad_norm": 3.296875, + "learning_rate": 0.019764448856778853, + "loss": 3.419, + "mean_token_accuracy": 0.37460413575172424, + "num_tokens": 953624550.0, + "step": 1865 + }, + { + "epoch": 0.5045970795024337, + "grad_norm": 2.734375, + "learning_rate": 0.01976409040568785, + "loss": 3.4615, + "mean_token_accuracy": 0.35859978199005127, + "num_tokens": 954148826.0, + "step": 1866 + }, + { + "epoch": 0.5048674959437534, + "grad_norm": 4.0, + "learning_rate": 0.01976373168568878, + "loss": 3.3457, + "mean_token_accuracy": 0.3485875129699707, + "num_tokens": 954673088.0, + "step": 1867 + }, + { + "epoch": 0.505137912385073, + "grad_norm": 3.109375, + "learning_rate": 0.019763372696792646, + "loss": 3.3069, + "mean_token_accuracy": 0.3718305826187134, + "num_tokens": 955197205.0, + "step": 1868 + }, + { + "epoch": 0.5054083288263926, + "grad_norm": 3.15625, + "learning_rate": 0.01976301343901047, + "loss": 3.2191, + "mean_token_accuracy": 0.3946496248245239, + "num_tokens": 955721328.0, + "step": 1869 + }, + { + "epoch": 0.5056787452677123, + "grad_norm": 4.375, + "learning_rate": 0.01976265391235327, + "loss": 3.3844, + "mean_token_accuracy": 0.3438328802585602, + "num_tokens": 956245546.0, + "step": 1870 + }, + { + "epoch": 0.5059491617090319, + "grad_norm": 1.71875, + "learning_rate": 0.01976229411683208, + "loss": 11.2899, + "mean_token_accuracy": 1.9807233911706135e-05, + "num_tokens": 956691364.0, + "step": 1871 + }, + { + "epoch": 0.5062195781503516, + "grad_norm": 8.75, + "learning_rate": 0.019761934052457936, + "loss": 4.1696, + "mean_token_accuracy": 0.3008328378200531, + "num_tokens": 957166142.0, + "step": 1872 + }, + { + "epoch": 0.5064899945916712, + "grad_norm": 2.484375, + "learning_rate": 0.01976157371924189, + "loss": 3.5168, + "mean_token_accuracy": 0.3624701201915741, + "num_tokens": 957662170.0, + "step": 1873 + }, + { + "epoch": 0.5067604110329909, + "grad_norm": 3.765625, + "learning_rate": 0.019761213117194993, + "loss": 3.3785, + "mean_token_accuracy": 0.36444783210754395, + "num_tokens": 958186208.0, + "step": 1874 + }, + { + "epoch": 0.5070308274743104, + "grad_norm": 2.4375, + "learning_rate": 0.019760852246328314, + "loss": 3.4143, + "mean_token_accuracy": 0.35577592253685, + "num_tokens": 958710416.0, + "step": 1875 + }, + { + "epoch": 0.50730124391563, + "grad_norm": 2.328125, + "learning_rate": 0.019760491106652928, + "loss": 3.2613, + "mean_token_accuracy": 0.3584468960762024, + "num_tokens": 959230279.0, + "step": 1876 + }, + { + "epoch": 0.5075716603569497, + "grad_norm": 3.265625, + "learning_rate": 0.01976012969817991, + "loss": 3.8095, + "mean_token_accuracy": 0.3334895372390747, + "num_tokens": 959754537.0, + "step": 1877 + }, + { + "epoch": 0.5078420767982693, + "grad_norm": 2.390625, + "learning_rate": 0.01975976802092035, + "loss": 3.1298, + "mean_token_accuracy": 0.3869898319244385, + "num_tokens": 960270560.0, + "step": 1878 + }, + { + "epoch": 0.508112493239589, + "grad_norm": 2.65625, + "learning_rate": 0.019759406074885348, + "loss": 3.3379, + "mean_token_accuracy": 0.3780132234096527, + "num_tokens": 960756152.0, + "step": 1879 + }, + { + "epoch": 0.5083829096809086, + "grad_norm": 2.78125, + "learning_rate": 0.019759043860086003, + "loss": 3.4441, + "mean_token_accuracy": 0.37048590183258057, + "num_tokens": 961280325.0, + "step": 1880 + }, + { + "epoch": 0.5086533261222282, + "grad_norm": 3.671875, + "learning_rate": 0.01975868137653344, + "loss": 3.6549, + "mean_token_accuracy": 0.3604704737663269, + "num_tokens": 961804434.0, + "step": 1881 + }, + { + "epoch": 0.5089237425635479, + "grad_norm": 2.84375, + "learning_rate": 0.01975831862423877, + "loss": 3.4092, + "mean_token_accuracy": 0.37136101722717285, + "num_tokens": 962291583.0, + "step": 1882 + }, + { + "epoch": 0.5091941590048675, + "grad_norm": 3.09375, + "learning_rate": 0.01975795560321313, + "loss": 3.5369, + "mean_token_accuracy": 0.37977904081344604, + "num_tokens": 962756823.0, + "step": 1883 + }, + { + "epoch": 0.5094645754461872, + "grad_norm": 3.5625, + "learning_rate": 0.019757592313467656, + "loss": 3.3813, + "mean_token_accuracy": 0.35103243589401245, + "num_tokens": 963281084.0, + "step": 1884 + }, + { + "epoch": 0.5097349918875067, + "grad_norm": 3.125, + "learning_rate": 0.019757228755013494, + "loss": 3.4663, + "mean_token_accuracy": 0.37576818466186523, + "num_tokens": 963805327.0, + "step": 1885 + }, + { + "epoch": 0.5100054083288263, + "grad_norm": 3.34375, + "learning_rate": 0.0197568649278618, + "loss": 3.2813, + "mean_token_accuracy": 0.3689764440059662, + "num_tokens": 964319631.0, + "step": 1886 + }, + { + "epoch": 0.510275824770146, + "grad_norm": 3.078125, + "learning_rate": 0.01975650083202374, + "loss": 3.5176, + "mean_token_accuracy": 0.3859337568283081, + "num_tokens": 964798262.0, + "step": 1887 + }, + { + "epoch": 0.5105462412114656, + "grad_norm": 2.65625, + "learning_rate": 0.019756136467510482, + "loss": 3.3881, + "mean_token_accuracy": 0.3670305013656616, + "num_tokens": 965264098.0, + "step": 1888 + }, + { + "epoch": 0.5108166576527853, + "grad_norm": 3.78125, + "learning_rate": 0.019755771834333212, + "loss": 3.5087, + "mean_token_accuracy": 0.35154420137405396, + "num_tokens": 965788356.0, + "step": 1889 + }, + { + "epoch": 0.5110870740941049, + "grad_norm": 3.796875, + "learning_rate": 0.01975540693250311, + "loss": 3.4827, + "mean_token_accuracy": 0.3644510507583618, + "num_tokens": 966276058.0, + "step": 1890 + }, + { + "epoch": 0.5113574905354246, + "grad_norm": 4.625, + "learning_rate": 0.019755041762031376, + "loss": 9.9988, + "mean_token_accuracy": 0.01862252503633499, + "num_tokens": 966800196.0, + "step": 1891 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 7.75, + "learning_rate": 0.019754676322929213, + "loss": 3.8953, + "mean_token_accuracy": 0.28748345375061035, + "num_tokens": 967324368.0, + "step": 1892 + }, + { + "epoch": 0.5118983234180638, + "grad_norm": 2.359375, + "learning_rate": 0.01975431061520783, + "loss": 3.436, + "mean_token_accuracy": 0.35773128271102905, + "num_tokens": 967808159.0, + "step": 1893 + }, + { + "epoch": 0.5121687398593835, + "grad_norm": 2.75, + "learning_rate": 0.01975394463887846, + "loss": 3.4642, + "mean_token_accuracy": 0.36099737882614136, + "num_tokens": 968332369.0, + "step": 1894 + }, + { + "epoch": 0.5124391563007031, + "grad_norm": 3.375, + "learning_rate": 0.01975357839395232, + "loss": 3.51, + "mean_token_accuracy": 0.3671067953109741, + "num_tokens": 968843392.0, + "step": 1895 + }, + { + "epoch": 0.5127095727420227, + "grad_norm": 3.0625, + "learning_rate": 0.019753211880440654, + "loss": 3.4221, + "mean_token_accuracy": 0.38140177726745605, + "num_tokens": 969303900.0, + "step": 1896 + }, + { + "epoch": 0.5129799891833423, + "grad_norm": 2.78125, + "learning_rate": 0.019752845098354704, + "loss": 3.2928, + "mean_token_accuracy": 0.4008479714393616, + "num_tokens": 969792807.0, + "step": 1897 + }, + { + "epoch": 0.513250405624662, + "grad_norm": 2.546875, + "learning_rate": 0.01975247804770573, + "loss": 3.329, + "mean_token_accuracy": 0.3704471290111542, + "num_tokens": 970317088.0, + "step": 1898 + }, + { + "epoch": 0.5135208220659816, + "grad_norm": 2.75, + "learning_rate": 0.01975211072850499, + "loss": 3.1834, + "mean_token_accuracy": 0.39447855949401855, + "num_tokens": 970841194.0, + "step": 1899 + }, + { + "epoch": 0.5137912385073012, + "grad_norm": 3.21875, + "learning_rate": 0.01975174314076375, + "loss": 3.3591, + "mean_token_accuracy": 0.3524714708328247, + "num_tokens": 971354266.0, + "step": 1900 + }, + { + "epoch": 0.5140616549486209, + "grad_norm": 3.453125, + "learning_rate": 0.019751375284493295, + "loss": 3.6189, + "mean_token_accuracy": 0.3390047550201416, + "num_tokens": 971878486.0, + "step": 1901 + }, + { + "epoch": 0.5143320713899405, + "grad_norm": 2.984375, + "learning_rate": 0.01975100715970491, + "loss": 3.4137, + "mean_token_accuracy": 0.3754537105560303, + "num_tokens": 972402746.0, + "step": 1902 + }, + { + "epoch": 0.5146024878312602, + "grad_norm": 2.765625, + "learning_rate": 0.01975063876640989, + "loss": 3.4695, + "mean_token_accuracy": 0.35443228483200073, + "num_tokens": 972927008.0, + "step": 1903 + }, + { + "epoch": 0.5148729042725798, + "grad_norm": 2.75, + "learning_rate": 0.019750270104619543, + "loss": 3.5335, + "mean_token_accuracy": 0.3622863292694092, + "num_tokens": 973451259.0, + "step": 1904 + }, + { + "epoch": 0.5151433207138995, + "grad_norm": 3.15625, + "learning_rate": 0.01974990117434517, + "loss": 3.3324, + "mean_token_accuracy": 0.3656594753265381, + "num_tokens": 973975447.0, + "step": 1905 + }, + { + "epoch": 0.515413737155219, + "grad_norm": 2.671875, + "learning_rate": 0.019749531975598103, + "loss": 3.2018, + "mean_token_accuracy": 0.4217531085014343, + "num_tokens": 974407491.0, + "step": 1906 + }, + { + "epoch": 0.5156841535965386, + "grad_norm": 3.03125, + "learning_rate": 0.01974916250838966, + "loss": 3.3361, + "mean_token_accuracy": 0.37457597255706787, + "num_tokens": 974931672.0, + "step": 1907 + }, + { + "epoch": 0.5159545700378583, + "grad_norm": 2.640625, + "learning_rate": 0.019748792772731182, + "loss": 3.5124, + "mean_token_accuracy": 0.3721730709075928, + "num_tokens": 975455714.0, + "step": 1908 + }, + { + "epoch": 0.5162249864791779, + "grad_norm": 2.953125, + "learning_rate": 0.01974842276863402, + "loss": 3.2297, + "mean_token_accuracy": 0.3960282802581787, + "num_tokens": 975940936.0, + "step": 1909 + }, + { + "epoch": 0.5164954029204976, + "grad_norm": 3.390625, + "learning_rate": 0.019748052496109513, + "loss": 3.6054, + "mean_token_accuracy": 0.3461005985736847, + "num_tokens": 976457376.0, + "step": 1910 + }, + { + "epoch": 0.5167658193618172, + "grad_norm": 41.0, + "learning_rate": 0.019747681955169027, + "loss": 19.8142, + "mean_token_accuracy": 0.04030700773000717, + "num_tokens": 976981493.0, + "step": 1911 + }, + { + "epoch": 0.5170362358031368, + "grad_norm": 10.0625, + "learning_rate": 0.01974731114582394, + "loss": 4.1068, + "mean_token_accuracy": 0.28592145442962646, + "num_tokens": 977412788.0, + "step": 1912 + }, + { + "epoch": 0.5173066522444565, + "grad_norm": 3.390625, + "learning_rate": 0.019746940068085618, + "loss": 3.719, + "mean_token_accuracy": 0.3560512661933899, + "num_tokens": 977877378.0, + "step": 1913 + }, + { + "epoch": 0.5175770686857761, + "grad_norm": 2.875, + "learning_rate": 0.019746568721965452, + "loss": 3.2299, + "mean_token_accuracy": 0.3827301263809204, + "num_tokens": 978401562.0, + "step": 1914 + }, + { + "epoch": 0.5178474851270958, + "grad_norm": 4.34375, + "learning_rate": 0.019746197107474838, + "loss": 3.228, + "mean_token_accuracy": 0.3738860487937927, + "num_tokens": 978896858.0, + "step": 1915 + }, + { + "epoch": 0.5181179015684153, + "grad_norm": 3.390625, + "learning_rate": 0.019745825224625174, + "loss": 3.5764, + "mean_token_accuracy": 0.35140135884284973, + "num_tokens": 979421109.0, + "step": 1916 + }, + { + "epoch": 0.518388318009735, + "grad_norm": 4.375, + "learning_rate": 0.019745453073427875, + "loss": 3.5864, + "mean_token_accuracy": 0.36458051204681396, + "num_tokens": 979945336.0, + "step": 1917 + }, + { + "epoch": 0.5186587344510546, + "grad_norm": 3.453125, + "learning_rate": 0.019745080653894357, + "loss": 3.6087, + "mean_token_accuracy": 0.34931105375289917, + "num_tokens": 980467765.0, + "step": 1918 + }, + { + "epoch": 0.5189291508923742, + "grad_norm": 2.421875, + "learning_rate": 0.019744707966036043, + "loss": 3.3855, + "mean_token_accuracy": 0.3555390238761902, + "num_tokens": 980992029.0, + "step": 1919 + }, + { + "epoch": 0.5191995673336939, + "grad_norm": 2.671875, + "learning_rate": 0.019744335009864375, + "loss": 3.4416, + "mean_token_accuracy": 0.3508220911026001, + "num_tokens": 981516214.0, + "step": 1920 + }, + { + "epoch": 0.5194699837750135, + "grad_norm": 3.765625, + "learning_rate": 0.019743961785390795, + "loss": 3.3542, + "mean_token_accuracy": 0.3851262927055359, + "num_tokens": 982001758.0, + "step": 1921 + }, + { + "epoch": 0.5197404002163332, + "grad_norm": 2.359375, + "learning_rate": 0.01974358829262675, + "loss": 3.5198, + "mean_token_accuracy": 0.3607907295227051, + "num_tokens": 982526035.0, + "step": 1922 + }, + { + "epoch": 0.5200108166576528, + "grad_norm": 4.09375, + "learning_rate": 0.019743214531583703, + "loss": 3.5096, + "mean_token_accuracy": 0.3583819270133972, + "num_tokens": 983050258.0, + "step": 1923 + }, + { + "epoch": 0.5202812330989725, + "grad_norm": 4.75, + "learning_rate": 0.01974284050227312, + "loss": 3.7021, + "mean_token_accuracy": 0.3590031564235687, + "num_tokens": 983531882.0, + "step": 1924 + }, + { + "epoch": 0.5205516495402921, + "grad_norm": 2.859375, + "learning_rate": 0.019742466204706485, + "loss": 3.4096, + "mean_token_accuracy": 0.3739239573478699, + "num_tokens": 984056162.0, + "step": 1925 + }, + { + "epoch": 0.5208220659816117, + "grad_norm": 3.390625, + "learning_rate": 0.019742091638895273, + "loss": 3.4358, + "mean_token_accuracy": 0.36754095554351807, + "num_tokens": 984580269.0, + "step": 1926 + }, + { + "epoch": 0.5210924824229313, + "grad_norm": 3.140625, + "learning_rate": 0.01974171680485098, + "loss": 3.4787, + "mean_token_accuracy": 0.3693883419036865, + "num_tokens": 985099272.0, + "step": 1927 + }, + { + "epoch": 0.5213628988642509, + "grad_norm": 3.3125, + "learning_rate": 0.01974134170258511, + "loss": 3.4742, + "mean_token_accuracy": 0.3697901964187622, + "num_tokens": 985623548.0, + "step": 1928 + }, + { + "epoch": 0.5216333153055706, + "grad_norm": 3.75, + "learning_rate": 0.019740966332109172, + "loss": 3.6479, + "mean_token_accuracy": 0.36165285110473633, + "num_tokens": 986104577.0, + "step": 1929 + }, + { + "epoch": 0.5219037317468902, + "grad_norm": 2.328125, + "learning_rate": 0.019740590693434677, + "loss": 3.499, + "mean_token_accuracy": 0.36844927072525024, + "num_tokens": 986628764.0, + "step": 1930 + }, + { + "epoch": 0.5221741481882098, + "grad_norm": 488.0, + "learning_rate": 0.01974021478657316, + "loss": 11.6406, + "mean_token_accuracy": 0.014408022165298462, + "num_tokens": 987152946.0, + "step": 1931 + }, + { + "epoch": 0.5224445646295295, + "grad_norm": 6.625, + "learning_rate": 0.019739838611536144, + "loss": 3.9994, + "mean_token_accuracy": 0.28215810656547546, + "num_tokens": 987677109.0, + "step": 1932 + }, + { + "epoch": 0.5227149810708491, + "grad_norm": 2.34375, + "learning_rate": 0.019739462168335182, + "loss": 3.4347, + "mean_token_accuracy": 0.35554444789886475, + "num_tokens": 988192745.0, + "step": 1933 + }, + { + "epoch": 0.5229853975121688, + "grad_norm": 2.859375, + "learning_rate": 0.019739085456981817, + "loss": 3.617, + "mean_token_accuracy": 0.3510884642601013, + "num_tokens": 988717014.0, + "step": 1934 + }, + { + "epoch": 0.5232558139534884, + "grad_norm": 2.71875, + "learning_rate": 0.01973870847748761, + "loss": 3.5419, + "mean_token_accuracy": 0.37061357498168945, + "num_tokens": 989241275.0, + "step": 1935 + }, + { + "epoch": 0.5235262303948081, + "grad_norm": 2.796875, + "learning_rate": 0.01973833122986413, + "loss": 3.3511, + "mean_token_accuracy": 0.3837374150753021, + "num_tokens": 989765510.0, + "step": 1936 + }, + { + "epoch": 0.5237966468361276, + "grad_norm": 2.984375, + "learning_rate": 0.01973795371412295, + "loss": 3.3762, + "mean_token_accuracy": 0.39306527376174927, + "num_tokens": 990229945.0, + "step": 1937 + }, + { + "epoch": 0.5240670632774472, + "grad_norm": 2.40625, + "learning_rate": 0.019737575930275652, + "loss": 3.2169, + "mean_token_accuracy": 0.38672325015068054, + "num_tokens": 990679532.0, + "step": 1938 + }, + { + "epoch": 0.5243374797187669, + "grad_norm": 4.25, + "learning_rate": 0.01973719787833383, + "loss": 3.1974, + "mean_token_accuracy": 0.4228203296661377, + "num_tokens": 991203741.0, + "step": 1939 + }, + { + "epoch": 0.5246078961600865, + "grad_norm": 3.09375, + "learning_rate": 0.019736819558309082, + "loss": 3.3416, + "mean_token_accuracy": 0.3678547143936157, + "num_tokens": 991728007.0, + "step": 1940 + }, + { + "epoch": 0.5248783126014062, + "grad_norm": 3.796875, + "learning_rate": 0.01973644097021302, + "loss": 3.6539, + "mean_token_accuracy": 0.3580387830734253, + "num_tokens": 992211303.0, + "step": 1941 + }, + { + "epoch": 0.5251487290427258, + "grad_norm": 2.859375, + "learning_rate": 0.019736062114057252, + "loss": 3.5692, + "mean_token_accuracy": 0.36305785179138184, + "num_tokens": 992698092.0, + "step": 1942 + }, + { + "epoch": 0.5254191454840454, + "grad_norm": 2.59375, + "learning_rate": 0.01973568298985341, + "loss": 3.4661, + "mean_token_accuracy": 0.3483789563179016, + "num_tokens": 993222299.0, + "step": 1943 + }, + { + "epoch": 0.5256895619253651, + "grad_norm": 3.171875, + "learning_rate": 0.01973530359761312, + "loss": 3.4445, + "mean_token_accuracy": 0.37020304799079895, + "num_tokens": 993746475.0, + "step": 1944 + }, + { + "epoch": 0.5259599783666847, + "grad_norm": 3.21875, + "learning_rate": 0.01973492393734803, + "loss": 3.4241, + "mean_token_accuracy": 0.39160460233688354, + "num_tokens": 994228756.0, + "step": 1945 + }, + { + "epoch": 0.5262303948080044, + "grad_norm": 3.625, + "learning_rate": 0.019734544009069786, + "loss": 3.2969, + "mean_token_accuracy": 0.37016934156417847, + "num_tokens": 994753037.0, + "step": 1946 + }, + { + "epoch": 0.5265008112493239, + "grad_norm": 3.296875, + "learning_rate": 0.019734163812790046, + "loss": 3.4989, + "mean_token_accuracy": 0.3686789572238922, + "num_tokens": 995277298.0, + "step": 1947 + }, + { + "epoch": 0.5267712276906436, + "grad_norm": 3.296875, + "learning_rate": 0.01973378334852047, + "loss": 3.2173, + "mean_token_accuracy": 0.417111337184906, + "num_tokens": 995801555.0, + "step": 1948 + }, + { + "epoch": 0.5270416441319632, + "grad_norm": 2.921875, + "learning_rate": 0.019733402616272742, + "loss": 3.4811, + "mean_token_accuracy": 0.3706010580062866, + "num_tokens": 996325802.0, + "step": 1949 + }, + { + "epoch": 0.5273120605732828, + "grad_norm": 10.4375, + "learning_rate": 0.019733021616058537, + "loss": 3.2001, + "mean_token_accuracy": 0.3936331272125244, + "num_tokens": 996814884.0, + "step": 1950 + }, + { + "epoch": 0.5275824770146025, + "grad_norm": 0.58203125, + "learning_rate": 0.019732640347889546, + "loss": 11.136, + "mean_token_accuracy": 8.089116818155162e-06, + "num_tokens": 997338962.0, + "step": 1951 + }, + { + "epoch": 0.5278528934559221, + "grad_norm": 4.71875, + "learning_rate": 0.019732258811777467, + "loss": 3.5557, + "mean_token_accuracy": 0.3382086753845215, + "num_tokens": 997814649.0, + "step": 1952 + }, + { + "epoch": 0.5281233098972418, + "grad_norm": 3.15625, + "learning_rate": 0.01973187700773401, + "loss": 3.7842, + "mean_token_accuracy": 0.3404623866081238, + "num_tokens": 998338900.0, + "step": 1953 + }, + { + "epoch": 0.5283937263385614, + "grad_norm": 4.46875, + "learning_rate": 0.019731494935770887, + "loss": 3.5848, + "mean_token_accuracy": 0.3486936092376709, + "num_tokens": 998863175.0, + "step": 1954 + }, + { + "epoch": 0.528664142779881, + "grad_norm": 3.59375, + "learning_rate": 0.01973111259589982, + "loss": 3.7204, + "mean_token_accuracy": 0.3518904745578766, + "num_tokens": 999358590.0, + "step": 1955 + }, + { + "epoch": 0.5289345592212007, + "grad_norm": 3.90625, + "learning_rate": 0.019730729988132543, + "loss": 3.4751, + "mean_token_accuracy": 0.3551873564720154, + "num_tokens": 999882856.0, + "step": 1956 + }, + { + "epoch": 0.5292049756625202, + "grad_norm": 3.4375, + "learning_rate": 0.019730347112480798, + "loss": 3.7062, + "mean_token_accuracy": 0.3337879180908203, + "num_tokens": 1000394987.0, + "step": 1957 + }, + { + "epoch": 0.5294753921038399, + "grad_norm": 3.25, + "learning_rate": 0.019729963968956325, + "loss": 3.689, + "mean_token_accuracy": 0.34249183535575867, + "num_tokens": 1000919219.0, + "step": 1958 + }, + { + "epoch": 0.5297458085451595, + "grad_norm": 3.75, + "learning_rate": 0.019729580557570885, + "loss": 3.5494, + "mean_token_accuracy": 0.3699413537979126, + "num_tokens": 1001377901.0, + "step": 1959 + }, + { + "epoch": 0.5300162249864792, + "grad_norm": 3.375, + "learning_rate": 0.01972919687833624, + "loss": 3.3706, + "mean_token_accuracy": 0.36509549617767334, + "num_tokens": 1001866488.0, + "step": 1960 + }, + { + "epoch": 0.5302866414277988, + "grad_norm": 4.5, + "learning_rate": 0.019728812931264165, + "loss": 3.7883, + "mean_token_accuracy": 0.3423740565776825, + "num_tokens": 1002376160.0, + "step": 1961 + }, + { + "epoch": 0.5305570578691184, + "grad_norm": 3.140625, + "learning_rate": 0.01972842871636644, + "loss": 3.6386, + "mean_token_accuracy": 0.367281436920166, + "num_tokens": 1002835610.0, + "step": 1962 + }, + { + "epoch": 0.5308274743104381, + "grad_norm": 2.859375, + "learning_rate": 0.019728044233654856, + "loss": 3.1209, + "mean_token_accuracy": 0.3816825747489929, + "num_tokens": 1003340500.0, + "step": 1963 + }, + { + "epoch": 0.5310978907517577, + "grad_norm": 2.78125, + "learning_rate": 0.019727659483141206, + "loss": 3.4736, + "mean_token_accuracy": 0.36095502972602844, + "num_tokens": 1003864631.0, + "step": 1964 + }, + { + "epoch": 0.5313683071930774, + "grad_norm": 3.3125, + "learning_rate": 0.019727274464837293, + "loss": 3.5561, + "mean_token_accuracy": 0.37171873450279236, + "num_tokens": 1004388560.0, + "step": 1965 + }, + { + "epoch": 0.531638723634397, + "grad_norm": 3.296875, + "learning_rate": 0.01972688917875494, + "loss": 3.4285, + "mean_token_accuracy": 0.38080650568008423, + "num_tokens": 1004867680.0, + "step": 1966 + }, + { + "epoch": 0.5319091400757167, + "grad_norm": 4.0, + "learning_rate": 0.01972650362490596, + "loss": 3.2203, + "mean_token_accuracy": 0.3741046190261841, + "num_tokens": 1005391858.0, + "step": 1967 + }, + { + "epoch": 0.5321795565170362, + "grad_norm": 3.28125, + "learning_rate": 0.019726117803302187, + "loss": 3.4756, + "mean_token_accuracy": 0.38367876410484314, + "num_tokens": 1005897359.0, + "step": 1968 + }, + { + "epoch": 0.5324499729583558, + "grad_norm": 4.625, + "learning_rate": 0.019725731713955454, + "loss": 3.158, + "mean_token_accuracy": 0.40744438767433167, + "num_tokens": 1006421542.0, + "step": 1969 + }, + { + "epoch": 0.5327203893996755, + "grad_norm": 3.3125, + "learning_rate": 0.019725345356877613, + "loss": 3.4706, + "mean_token_accuracy": 0.3813997507095337, + "num_tokens": 1006914167.0, + "step": 1970 + }, + { + "epoch": 0.5329908058409951, + "grad_norm": 78.0, + "learning_rate": 0.01972495873208052, + "loss": 20.282, + "mean_token_accuracy": 0.0, + "num_tokens": 1007438382.0, + "step": 1971 + }, + { + "epoch": 0.5332612222823148, + "grad_norm": 9.4375, + "learning_rate": 0.01972457183957603, + "loss": 4.5663, + "mean_token_accuracy": 0.29520636796951294, + "num_tokens": 1007936088.0, + "step": 1972 + }, + { + "epoch": 0.5335316387236344, + "grad_norm": 2.828125, + "learning_rate": 0.019724184679376023, + "loss": 3.7519, + "mean_token_accuracy": 0.3373051583766937, + "num_tokens": 1008411873.0, + "step": 1973 + }, + { + "epoch": 0.533802055164954, + "grad_norm": 3.234375, + "learning_rate": 0.01972379725149237, + "loss": 3.2795, + "mean_token_accuracy": 0.3778421878814697, + "num_tokens": 1008936078.0, + "step": 1974 + }, + { + "epoch": 0.5340724716062737, + "grad_norm": 3.6875, + "learning_rate": 0.019723409555936964, + "loss": 3.7119, + "mean_token_accuracy": 0.32497385144233704, + "num_tokens": 1009460153.0, + "step": 1975 + }, + { + "epoch": 0.5343428880475933, + "grad_norm": 2.984375, + "learning_rate": 0.0197230215927217, + "loss": 3.56, + "mean_token_accuracy": 0.37084323167800903, + "num_tokens": 1009984403.0, + "step": 1976 + }, + { + "epoch": 0.534613304488913, + "grad_norm": 3.34375, + "learning_rate": 0.01972263336185848, + "loss": 3.6029, + "mean_token_accuracy": 0.3272078037261963, + "num_tokens": 1010461386.0, + "step": 1977 + }, + { + "epoch": 0.5348837209302325, + "grad_norm": 2.515625, + "learning_rate": 0.019722244863359214, + "loss": 3.4332, + "mean_token_accuracy": 0.38930702209472656, + "num_tokens": 1010954224.0, + "step": 1978 + }, + { + "epoch": 0.5351541373715522, + "grad_norm": 2.84375, + "learning_rate": 0.01972185609723583, + "loss": 3.3254, + "mean_token_accuracy": 0.3726305067539215, + "num_tokens": 1011416103.0, + "step": 1979 + }, + { + "epoch": 0.5354245538128718, + "grad_norm": 2.953125, + "learning_rate": 0.01972146706350025, + "loss": 3.3291, + "mean_token_accuracy": 0.3645083010196686, + "num_tokens": 1011910167.0, + "step": 1980 + }, + { + "epoch": 0.5356949702541914, + "grad_norm": 3.015625, + "learning_rate": 0.01972107776216441, + "loss": 3.5628, + "mean_token_accuracy": 0.3676344156265259, + "num_tokens": 1012434277.0, + "step": 1981 + }, + { + "epoch": 0.5359653866955111, + "grad_norm": 3.796875, + "learning_rate": 0.019720688193240264, + "loss": 3.8432, + "mean_token_accuracy": 0.35188746452331543, + "num_tokens": 1012917857.0, + "step": 1982 + }, + { + "epoch": 0.5362358031368307, + "grad_norm": 2.0625, + "learning_rate": 0.01972029835673975, + "loss": 3.3004, + "mean_token_accuracy": 0.3699580729007721, + "num_tokens": 1013442117.0, + "step": 1983 + }, + { + "epoch": 0.5365062195781504, + "grad_norm": 2.796875, + "learning_rate": 0.019719908252674844, + "loss": 3.5645, + "mean_token_accuracy": 0.36684471368789673, + "num_tokens": 1013918253.0, + "step": 1984 + }, + { + "epoch": 0.53677663601947, + "grad_norm": 3.015625, + "learning_rate": 0.019719517881057504, + "loss": 3.6867, + "mean_token_accuracy": 0.3607637882232666, + "num_tokens": 1014442535.0, + "step": 1985 + }, + { + "epoch": 0.5370470524607897, + "grad_norm": 3.296875, + "learning_rate": 0.01971912724189972, + "loss": 3.5574, + "mean_token_accuracy": 0.3446879982948303, + "num_tokens": 1014966760.0, + "step": 1986 + }, + { + "epoch": 0.5373174689021093, + "grad_norm": 3.0, + "learning_rate": 0.019718736335213462, + "loss": 3.5208, + "mean_token_accuracy": 0.3420591354370117, + "num_tokens": 1015491034.0, + "step": 1987 + }, + { + "epoch": 0.5375878853434288, + "grad_norm": 3.03125, + "learning_rate": 0.019718345161010738, + "loss": 2.9791, + "mean_token_accuracy": 0.39177194237709045, + "num_tokens": 1016015228.0, + "step": 1988 + }, + { + "epoch": 0.5378583017847485, + "grad_norm": 3.171875, + "learning_rate": 0.019717953719303544, + "loss": 3.3783, + "mean_token_accuracy": 0.3621375560760498, + "num_tokens": 1016539498.0, + "step": 1989 + }, + { + "epoch": 0.5381287182260681, + "grad_norm": 2.6875, + "learning_rate": 0.019717562010103894, + "loss": 3.2973, + "mean_token_accuracy": 0.35478484630584717, + "num_tokens": 1017063693.0, + "step": 1990 + }, + { + "epoch": 0.5383991346673878, + "grad_norm": 4.28125, + "learning_rate": 0.019717170033423805, + "loss": 10.2231, + "mean_token_accuracy": 1.7130783817265183e-05, + "num_tokens": 1017587970.0, + "step": 1991 + }, + { + "epoch": 0.5386695511087074, + "grad_norm": 6.40625, + "learning_rate": 0.019716777789275303, + "loss": 3.941, + "mean_token_accuracy": 0.32931625843048096, + "num_tokens": 1018082926.0, + "step": 1992 + }, + { + "epoch": 0.538939967550027, + "grad_norm": 2.625, + "learning_rate": 0.01971638527767042, + "loss": 3.5347, + "mean_token_accuracy": 0.35247802734375, + "num_tokens": 1018607196.0, + "step": 1993 + }, + { + "epoch": 0.5392103839913467, + "grad_norm": 4.15625, + "learning_rate": 0.01971599249862121, + "loss": 3.6708, + "mean_token_accuracy": 0.35138753056526184, + "num_tokens": 1019071311.0, + "step": 1994 + }, + { + "epoch": 0.5394808004326663, + "grad_norm": 2.671875, + "learning_rate": 0.019715599452139715, + "loss": 3.5531, + "mean_token_accuracy": 0.37509554624557495, + "num_tokens": 1019532982.0, + "step": 1995 + }, + { + "epoch": 0.539751216873986, + "grad_norm": 3.0, + "learning_rate": 0.019715206138238, + "loss": 3.4406, + "mean_token_accuracy": 0.36031994223594666, + "num_tokens": 1020057235.0, + "step": 1996 + }, + { + "epoch": 0.5400216333153056, + "grad_norm": 2.625, + "learning_rate": 0.019714812556928127, + "loss": 3.5998, + "mean_token_accuracy": 0.3521052598953247, + "num_tokens": 1020581348.0, + "step": 1997 + }, + { + "epoch": 0.5402920497566251, + "grad_norm": 2.828125, + "learning_rate": 0.01971441870822218, + "loss": 3.5732, + "mean_token_accuracy": 0.3421984910964966, + "num_tokens": 1021105634.0, + "step": 1998 + }, + { + "epoch": 0.5405624661979448, + "grad_norm": 2.921875, + "learning_rate": 0.01971402459213224, + "loss": 3.4618, + "mean_token_accuracy": 0.37523773312568665, + "num_tokens": 1021622425.0, + "step": 1999 + }, + { + "epoch": 0.5408328826392644, + "grad_norm": 3.3125, + "learning_rate": 0.019713630208670396, + "loss": 3.5146, + "mean_token_accuracy": 0.3824162185192108, + "num_tokens": 1022137491.0, + "step": 2000 + }, + { + "epoch": 0.5411032990805841, + "grad_norm": 4.96875, + "learning_rate": 0.019713235557848752, + "loss": 3.4249, + "mean_token_accuracy": 0.3905680775642395, + "num_tokens": 1022661755.0, + "step": 2001 + }, + { + "epoch": 0.5413737155219037, + "grad_norm": 2.734375, + "learning_rate": 0.01971284063967942, + "loss": 3.4433, + "mean_token_accuracy": 0.3698263168334961, + "num_tokens": 1023172336.0, + "step": 2002 + }, + { + "epoch": 0.5416441319632234, + "grad_norm": 4.9375, + "learning_rate": 0.01971244545417451, + "loss": 3.5734, + "mean_token_accuracy": 0.3404613137245178, + "num_tokens": 1023696517.0, + "step": 2003 + }, + { + "epoch": 0.541914548404543, + "grad_norm": 4.3125, + "learning_rate": 0.019712050001346154, + "loss": 3.2123, + "mean_token_accuracy": 0.38299500942230225, + "num_tokens": 1024220795.0, + "step": 2004 + }, + { + "epoch": 0.5421849648458626, + "grad_norm": 3.421875, + "learning_rate": 0.019711654281206487, + "loss": 3.5256, + "mean_token_accuracy": 0.3756309151649475, + "num_tokens": 1024686002.0, + "step": 2005 + }, + { + "epoch": 0.5424553812871823, + "grad_norm": 3.859375, + "learning_rate": 0.019711258293767644, + "loss": 3.6222, + "mean_token_accuracy": 0.3535250425338745, + "num_tokens": 1025210234.0, + "step": 2006 + }, + { + "epoch": 0.5427257977285019, + "grad_norm": 2.71875, + "learning_rate": 0.019710862039041776, + "loss": 3.1587, + "mean_token_accuracy": 0.3611626625061035, + "num_tokens": 1025734425.0, + "step": 2007 + }, + { + "epoch": 0.5429962141698216, + "grad_norm": 4.9375, + "learning_rate": 0.01971046551704105, + "loss": 3.5053, + "mean_token_accuracy": 0.39272600412368774, + "num_tokens": 1026258599.0, + "step": 2008 + }, + { + "epoch": 0.5432666306111411, + "grad_norm": 2.8125, + "learning_rate": 0.01971006872777762, + "loss": 3.4696, + "mean_token_accuracy": 0.33623841404914856, + "num_tokens": 1026782714.0, + "step": 2009 + }, + { + "epoch": 0.5435370470524608, + "grad_norm": 2.34375, + "learning_rate": 0.019709671671263668, + "loss": 3.3275, + "mean_token_accuracy": 0.3794398903846741, + "num_tokens": 1027306941.0, + "step": 2010 + }, + { + "epoch": 0.5438074634937804, + "grad_norm": 17.625, + "learning_rate": 0.01970927434751138, + "loss": 12.5089, + "mean_token_accuracy": 0.011865723878145218, + "num_tokens": 1027831082.0, + "step": 2011 + }, + { + "epoch": 0.5440778799351, + "grad_norm": 6.375, + "learning_rate": 0.019708876756532937, + "loss": 3.841, + "mean_token_accuracy": 0.2911508083343506, + "num_tokens": 1028355368.0, + "step": 2012 + }, + { + "epoch": 0.5443482963764197, + "grad_norm": 2.21875, + "learning_rate": 0.01970847889834055, + "loss": 3.2993, + "mean_token_accuracy": 0.36200687289237976, + "num_tokens": 1028879415.0, + "step": 2013 + }, + { + "epoch": 0.5446187128177393, + "grad_norm": 2.09375, + "learning_rate": 0.019708080772946418, + "loss": 3.2103, + "mean_token_accuracy": 0.38818663358688354, + "num_tokens": 1029403646.0, + "step": 2014 + }, + { + "epoch": 0.544889129259059, + "grad_norm": 3.21875, + "learning_rate": 0.019707682380362756, + "loss": 3.4903, + "mean_token_accuracy": 0.3649061918258667, + "num_tokens": 1029882812.0, + "step": 2015 + }, + { + "epoch": 0.5451595457003786, + "grad_norm": 2.265625, + "learning_rate": 0.019707283720601793, + "loss": 3.143, + "mean_token_accuracy": 0.37744539976119995, + "num_tokens": 1030406926.0, + "step": 2016 + }, + { + "epoch": 0.5454299621416983, + "grad_norm": 2.5625, + "learning_rate": 0.019706884793675758, + "loss": 3.5807, + "mean_token_accuracy": 0.3261260390281677, + "num_tokens": 1030931201.0, + "step": 2017 + }, + { + "epoch": 0.5457003785830179, + "grad_norm": 2.828125, + "learning_rate": 0.019706485599596896, + "loss": 3.3276, + "mean_token_accuracy": 0.37232300639152527, + "num_tokens": 1031406749.0, + "step": 2018 + }, + { + "epoch": 0.5459707950243374, + "grad_norm": 2.625, + "learning_rate": 0.01970608613837745, + "loss": 3.4299, + "mean_token_accuracy": 0.3736059367656708, + "num_tokens": 1031930986.0, + "step": 2019 + }, + { + "epoch": 0.5462412114656571, + "grad_norm": 2.40625, + "learning_rate": 0.019705686410029673, + "loss": 3.3717, + "mean_token_accuracy": 0.36974284052848816, + "num_tokens": 1032455225.0, + "step": 2020 + }, + { + "epoch": 0.5465116279069767, + "grad_norm": 2.328125, + "learning_rate": 0.019705286414565842, + "loss": 3.3605, + "mean_token_accuracy": 0.38146644830703735, + "num_tokens": 1032979444.0, + "step": 2021 + }, + { + "epoch": 0.5467820443482964, + "grad_norm": 3.109375, + "learning_rate": 0.01970488615199822, + "loss": 3.4682, + "mean_token_accuracy": 0.3512589931488037, + "num_tokens": 1033503638.0, + "step": 2022 + }, + { + "epoch": 0.547052460789616, + "grad_norm": 3.765625, + "learning_rate": 0.01970448562233909, + "loss": 3.4848, + "mean_token_accuracy": 0.36031436920166016, + "num_tokens": 1034027702.0, + "step": 2023 + }, + { + "epoch": 0.5473228772309356, + "grad_norm": 2.578125, + "learning_rate": 0.01970408482560075, + "loss": 3.2582, + "mean_token_accuracy": 0.3775365948677063, + "num_tokens": 1034551959.0, + "step": 2024 + }, + { + "epoch": 0.5475932936722553, + "grad_norm": 3.875, + "learning_rate": 0.019703683761795484, + "loss": 3.2959, + "mean_token_accuracy": 0.38721105456352234, + "num_tokens": 1035076085.0, + "step": 2025 + }, + { + "epoch": 0.5478637101135749, + "grad_norm": 2.46875, + "learning_rate": 0.019703282430935606, + "loss": 3.1899, + "mean_token_accuracy": 0.3769124746322632, + "num_tokens": 1035555927.0, + "step": 2026 + }, + { + "epoch": 0.5481341265548946, + "grad_norm": 2.671875, + "learning_rate": 0.01970288083303343, + "loss": 3.4359, + "mean_token_accuracy": 0.39497947692871094, + "num_tokens": 1036021227.0, + "step": 2027 + }, + { + "epoch": 0.5484045429962142, + "grad_norm": 4.65625, + "learning_rate": 0.019702478968101275, + "loss": 3.5848, + "mean_token_accuracy": 0.37219297885894775, + "num_tokens": 1036525535.0, + "step": 2028 + }, + { + "epoch": 0.5486749594375337, + "grad_norm": 3.90625, + "learning_rate": 0.019702076836151473, + "loss": 3.5839, + "mean_token_accuracy": 0.3691689968109131, + "num_tokens": 1037049812.0, + "step": 2029 + }, + { + "epoch": 0.5489453758788534, + "grad_norm": 3.546875, + "learning_rate": 0.01970167443719636, + "loss": 3.5167, + "mean_token_accuracy": 0.370718389749527, + "num_tokens": 1037529737.0, + "step": 2030 + }, + { + "epoch": 0.549215792320173, + "grad_norm": 146.0, + "learning_rate": 0.01970127177124829, + "loss": 16.538, + "mean_token_accuracy": 0.011181168258190155, + "num_tokens": 1038053860.0, + "step": 2031 + }, + { + "epoch": 0.5494862087614927, + "grad_norm": 8.0, + "learning_rate": 0.01970086883831961, + "loss": 4.0966, + "mean_token_accuracy": 0.33551889657974243, + "num_tokens": 1038533895.0, + "step": 2032 + }, + { + "epoch": 0.5497566252028123, + "grad_norm": 2.015625, + "learning_rate": 0.019700465638422687, + "loss": 3.5078, + "mean_token_accuracy": 0.3607105612754822, + "num_tokens": 1039058054.0, + "step": 2033 + }, + { + "epoch": 0.550027041644132, + "grad_norm": 2.09375, + "learning_rate": 0.019700062171569895, + "loss": 3.6307, + "mean_token_accuracy": 0.3692691922187805, + "num_tokens": 1039540522.0, + "step": 2034 + }, + { + "epoch": 0.5502974580854516, + "grad_norm": 3.359375, + "learning_rate": 0.019699658437773604, + "loss": 3.4824, + "mean_token_accuracy": 0.36693257093429565, + "num_tokens": 1040064707.0, + "step": 2035 + }, + { + "epoch": 0.5505678745267713, + "grad_norm": 4.21875, + "learning_rate": 0.01969925443704621, + "loss": 3.517, + "mean_token_accuracy": 0.35605451464653015, + "num_tokens": 1040548978.0, + "step": 2036 + }, + { + "epoch": 0.5508382909680909, + "grad_norm": 3.09375, + "learning_rate": 0.019698850169400112, + "loss": 3.2098, + "mean_token_accuracy": 0.35838979482650757, + "num_tokens": 1041073021.0, + "step": 2037 + }, + { + "epoch": 0.5511087074094105, + "grad_norm": 2.578125, + "learning_rate": 0.019698445634847708, + "loss": 3.5515, + "mean_token_accuracy": 0.36732423305511475, + "num_tokens": 1041597280.0, + "step": 2038 + }, + { + "epoch": 0.5513791238507302, + "grad_norm": 3.59375, + "learning_rate": 0.01969804083340141, + "loss": 3.4947, + "mean_token_accuracy": 0.3633720874786377, + "num_tokens": 1042121463.0, + "step": 2039 + }, + { + "epoch": 0.5516495402920497, + "grad_norm": 2.40625, + "learning_rate": 0.019697635765073634, + "loss": 3.2305, + "mean_token_accuracy": 0.37817344069480896, + "num_tokens": 1042645696.0, + "step": 2040 + }, + { + "epoch": 0.5519199567333694, + "grad_norm": 2.953125, + "learning_rate": 0.01969723042987682, + "loss": 3.3569, + "mean_token_accuracy": 0.38574841618537903, + "num_tokens": 1043111089.0, + "step": 2041 + }, + { + "epoch": 0.552190373174689, + "grad_norm": 3.546875, + "learning_rate": 0.019696824827823402, + "loss": 3.6557, + "mean_token_accuracy": 0.3606376349925995, + "num_tokens": 1043600533.0, + "step": 2042 + }, + { + "epoch": 0.5524607896160086, + "grad_norm": 2.984375, + "learning_rate": 0.019696418958925817, + "loss": 3.3814, + "mean_token_accuracy": 0.36482277512550354, + "num_tokens": 1044124817.0, + "step": 2043 + }, + { + "epoch": 0.5527312060573283, + "grad_norm": 3.140625, + "learning_rate": 0.01969601282319653, + "loss": 3.3917, + "mean_token_accuracy": 0.36891013383865356, + "num_tokens": 1044648945.0, + "step": 2044 + }, + { + "epoch": 0.5530016224986479, + "grad_norm": 3.046875, + "learning_rate": 0.019695606420647993, + "loss": 3.3822, + "mean_token_accuracy": 0.3524155616760254, + "num_tokens": 1045173188.0, + "step": 2045 + }, + { + "epoch": 0.5532720389399676, + "grad_norm": 3.25, + "learning_rate": 0.019695199751292677, + "loss": 3.4073, + "mean_token_accuracy": 0.350972443819046, + "num_tokens": 1045697278.0, + "step": 2046 + }, + { + "epoch": 0.5535424553812872, + "grad_norm": 3.234375, + "learning_rate": 0.019694792815143067, + "loss": 3.5684, + "mean_token_accuracy": 0.38109123706817627, + "num_tokens": 1046174902.0, + "step": 2047 + }, + { + "epoch": 0.5538128718226069, + "grad_norm": 2.796875, + "learning_rate": 0.01969438561221164, + "loss": 3.3806, + "mean_token_accuracy": 0.38619735836982727, + "num_tokens": 1046641395.0, + "step": 2048 + }, + { + "epoch": 0.5540832882639265, + "grad_norm": 2.625, + "learning_rate": 0.019693978142510893, + "loss": 3.4335, + "mean_token_accuracy": 0.36869722604751587, + "num_tokens": 1047129089.0, + "step": 2049 + }, + { + "epoch": 0.554353704705246, + "grad_norm": 2.84375, + "learning_rate": 0.01969357040605333, + "loss": 3.2543, + "mean_token_accuracy": 0.385199636220932, + "num_tokens": 1047641419.0, + "step": 2050 + }, + { + "epoch": 0.5546241211465657, + "grad_norm": 366.0, + "learning_rate": 0.019693162402851464, + "loss": 30.8351, + "mean_token_accuracy": 0.0002538474800530821, + "num_tokens": 1048165634.0, + "step": 2051 + }, + { + "epoch": 0.5548945375878853, + "grad_norm": 8.125, + "learning_rate": 0.019692754132917805, + "loss": 4.0594, + "mean_token_accuracy": 0.2658936679363251, + "num_tokens": 1048689895.0, + "step": 2052 + }, + { + "epoch": 0.555164954029205, + "grad_norm": 2.703125, + "learning_rate": 0.01969234559626489, + "loss": 3.5618, + "mean_token_accuracy": 0.351884663105011, + "num_tokens": 1049214178.0, + "step": 2053 + }, + { + "epoch": 0.5554353704705246, + "grad_norm": 2.6875, + "learning_rate": 0.01969193679290525, + "loss": 3.3411, + "mean_token_accuracy": 0.3439875543117523, + "num_tokens": 1049738297.0, + "step": 2054 + }, + { + "epoch": 0.5557057869118442, + "grad_norm": 3.53125, + "learning_rate": 0.019691527722851425, + "loss": 3.5038, + "mean_token_accuracy": 0.36031556129455566, + "num_tokens": 1050262492.0, + "step": 2055 + }, + { + "epoch": 0.5559762033531639, + "grad_norm": 4.0625, + "learning_rate": 0.01969111838611597, + "loss": 3.5184, + "mean_token_accuracy": 0.3656240701675415, + "num_tokens": 1050786776.0, + "step": 2056 + }, + { + "epoch": 0.5562466197944835, + "grad_norm": 2.9375, + "learning_rate": 0.019690708782711445, + "loss": 3.408, + "mean_token_accuracy": 0.38464081287384033, + "num_tokens": 1051310959.0, + "step": 2057 + }, + { + "epoch": 0.5565170362358032, + "grad_norm": 3.40625, + "learning_rate": 0.019690298912650415, + "loss": 3.6114, + "mean_token_accuracy": 0.3553841710090637, + "num_tokens": 1051781050.0, + "step": 2058 + }, + { + "epoch": 0.5567874526771228, + "grad_norm": 2.203125, + "learning_rate": 0.01968988877594546, + "loss": 3.3085, + "mean_token_accuracy": 0.38375240564346313, + "num_tokens": 1052305261.0, + "step": 2059 + }, + { + "epoch": 0.5570578691184424, + "grad_norm": 2.703125, + "learning_rate": 0.01968947837260916, + "loss": 3.41, + "mean_token_accuracy": 0.36356326937675476, + "num_tokens": 1052829456.0, + "step": 2060 + }, + { + "epoch": 0.557328285559762, + "grad_norm": 3.578125, + "learning_rate": 0.019689067702654113, + "loss": 3.478, + "mean_token_accuracy": 0.35715562105178833, + "num_tokens": 1053353707.0, + "step": 2061 + }, + { + "epoch": 0.5575987020010816, + "grad_norm": 3.53125, + "learning_rate": 0.019688656766092914, + "loss": 3.3496, + "mean_token_accuracy": 0.3785877525806427, + "num_tokens": 1053877992.0, + "step": 2062 + }, + { + "epoch": 0.5578691184424013, + "grad_norm": 2.59375, + "learning_rate": 0.01968824556293817, + "loss": 3.5973, + "mean_token_accuracy": 0.3525284230709076, + "num_tokens": 1054385114.0, + "step": 2063 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 2.78125, + "learning_rate": 0.019687834093202506, + "loss": 3.3495, + "mean_token_accuracy": 0.3910757303237915, + "num_tokens": 1054909272.0, + "step": 2064 + }, + { + "epoch": 0.5584099513250406, + "grad_norm": 3.015625, + "learning_rate": 0.01968742235689854, + "loss": 3.4648, + "mean_token_accuracy": 0.3731226921081543, + "num_tokens": 1055405392.0, + "step": 2065 + }, + { + "epoch": 0.5586803677663602, + "grad_norm": 3.578125, + "learning_rate": 0.01968701035403891, + "loss": 3.4239, + "mean_token_accuracy": 0.37147432565689087, + "num_tokens": 1055882768.0, + "step": 2066 + }, + { + "epoch": 0.5589507842076799, + "grad_norm": 2.5625, + "learning_rate": 0.01968659808463626, + "loss": 3.0727, + "mean_token_accuracy": 0.40781930088996887, + "num_tokens": 1056406951.0, + "step": 2067 + }, + { + "epoch": 0.5592212006489995, + "grad_norm": 2.5625, + "learning_rate": 0.019686185548703227, + "loss": 3.3771, + "mean_token_accuracy": 0.3589591383934021, + "num_tokens": 1056930959.0, + "step": 2068 + }, + { + "epoch": 0.5594916170903191, + "grad_norm": 2.859375, + "learning_rate": 0.01968577274625248, + "loss": 3.4744, + "mean_token_accuracy": 0.3776007890701294, + "num_tokens": 1057455160.0, + "step": 2069 + }, + { + "epoch": 0.5597620335316387, + "grad_norm": 3.65625, + "learning_rate": 0.01968535967729668, + "loss": 3.4283, + "mean_token_accuracy": 0.374987930059433, + "num_tokens": 1057936383.0, + "step": 2070 + }, + { + "epoch": 0.5600324499729583, + "grad_norm": 3.109375, + "learning_rate": 0.01968494634184851, + "loss": 10.8175, + "mean_token_accuracy": 3.212452429579571e-05, + "num_tokens": 1058415005.0, + "step": 2071 + }, + { + "epoch": 0.560302866414278, + "grad_norm": 6.0625, + "learning_rate": 0.019684532739920636, + "loss": 4.0477, + "mean_token_accuracy": 0.31626448035240173, + "num_tokens": 1058913740.0, + "step": 2072 + }, + { + "epoch": 0.5605732828555976, + "grad_norm": 2.1875, + "learning_rate": 0.019684118871525765, + "loss": 3.6213, + "mean_token_accuracy": 0.37143006920814514, + "num_tokens": 1059411238.0, + "step": 2073 + }, + { + "epoch": 0.5608436992969172, + "grad_norm": 3.0625, + "learning_rate": 0.019683704736676588, + "loss": 3.4272, + "mean_token_accuracy": 0.37018534541130066, + "num_tokens": 1059875345.0, + "step": 2074 + }, + { + "epoch": 0.5611141157382369, + "grad_norm": 3.34375, + "learning_rate": 0.019683290335385812, + "loss": 3.489, + "mean_token_accuracy": 0.3659268617630005, + "num_tokens": 1060399559.0, + "step": 2075 + }, + { + "epoch": 0.5613845321795565, + "grad_norm": 3.09375, + "learning_rate": 0.01968287566766615, + "loss": 3.4758, + "mean_token_accuracy": 0.36598414182662964, + "num_tokens": 1060923791.0, + "step": 2076 + }, + { + "epoch": 0.5616549486208762, + "grad_norm": 3.609375, + "learning_rate": 0.019682460733530333, + "loss": 3.4319, + "mean_token_accuracy": 0.3801896572113037, + "num_tokens": 1061447956.0, + "step": 2077 + }, + { + "epoch": 0.5619253650621958, + "grad_norm": 2.765625, + "learning_rate": 0.01968204553299108, + "loss": 3.2641, + "mean_token_accuracy": 0.37443625926971436, + "num_tokens": 1061972228.0, + "step": 2078 + }, + { + "epoch": 0.5621957815035155, + "grad_norm": 3.125, + "learning_rate": 0.01968163006606114, + "loss": 3.6076, + "mean_token_accuracy": 0.35244351625442505, + "num_tokens": 1062496399.0, + "step": 2079 + }, + { + "epoch": 0.5624661979448351, + "grad_norm": 2.671875, + "learning_rate": 0.019681214332753265, + "loss": 3.3293, + "mean_token_accuracy": 0.3842916190624237, + "num_tokens": 1063020624.0, + "step": 2080 + }, + { + "epoch": 0.5627366143861546, + "grad_norm": 3.359375, + "learning_rate": 0.019680798333080198, + "loss": 3.3721, + "mean_token_accuracy": 0.3583354353904724, + "num_tokens": 1063544821.0, + "step": 2081 + }, + { + "epoch": 0.5630070308274743, + "grad_norm": 2.828125, + "learning_rate": 0.01968038206705471, + "loss": 3.3675, + "mean_token_accuracy": 0.369820237159729, + "num_tokens": 1064069081.0, + "step": 2082 + }, + { + "epoch": 0.5632774472687939, + "grad_norm": 3.953125, + "learning_rate": 0.019679965534689575, + "loss": 3.4474, + "mean_token_accuracy": 0.35094255208969116, + "num_tokens": 1064593182.0, + "step": 2083 + }, + { + "epoch": 0.5635478637101136, + "grad_norm": 2.265625, + "learning_rate": 0.019679548735997567, + "loss": 3.4684, + "mean_token_accuracy": 0.36862868070602417, + "num_tokens": 1065091812.0, + "step": 2084 + }, + { + "epoch": 0.5638182801514332, + "grad_norm": 3.25, + "learning_rate": 0.019679131670991487, + "loss": 3.3226, + "mean_token_accuracy": 0.37758326530456543, + "num_tokens": 1065581943.0, + "step": 2085 + }, + { + "epoch": 0.5640886965927528, + "grad_norm": 4.09375, + "learning_rate": 0.019678714339684118, + "loss": 3.4714, + "mean_token_accuracy": 0.36915361881256104, + "num_tokens": 1066106170.0, + "step": 2086 + }, + { + "epoch": 0.5643591130340725, + "grad_norm": 3.90625, + "learning_rate": 0.01967829674208827, + "loss": 3.4081, + "mean_token_accuracy": 0.3628978133201599, + "num_tokens": 1066630379.0, + "step": 2087 + }, + { + "epoch": 0.5646295294753921, + "grad_norm": 2.984375, + "learning_rate": 0.019677878878216756, + "loss": 3.2062, + "mean_token_accuracy": 0.3876727223396301, + "num_tokens": 1067154654.0, + "step": 2088 + }, + { + "epoch": 0.5648999459167118, + "grad_norm": 3.0, + "learning_rate": 0.019677460748082402, + "loss": 3.3115, + "mean_token_accuracy": 0.37568604946136475, + "num_tokens": 1067678808.0, + "step": 2089 + }, + { + "epoch": 0.5651703623580314, + "grad_norm": 2.65625, + "learning_rate": 0.019677042351698034, + "loss": 3.4288, + "mean_token_accuracy": 0.3923162519931793, + "num_tokens": 1068148463.0, + "step": 2090 + }, + { + "epoch": 0.565440778799351, + "grad_norm": 17.875, + "learning_rate": 0.019676623689076488, + "loss": 13.7443, + "mean_token_accuracy": 0.011816542595624924, + "num_tokens": 1068654536.0, + "step": 2091 + }, + { + "epoch": 0.5657111952406706, + "grad_norm": 10.1875, + "learning_rate": 0.019676204760230607, + "loss": 4.2761, + "mean_token_accuracy": 0.27346158027648926, + "num_tokens": 1069154724.0, + "step": 2092 + }, + { + "epoch": 0.5659816116819902, + "grad_norm": 3.65625, + "learning_rate": 0.019675785565173253, + "loss": 3.5307, + "mean_token_accuracy": 0.3543836176395416, + "num_tokens": 1069624897.0, + "step": 2093 + }, + { + "epoch": 0.5662520281233099, + "grad_norm": 3.390625, + "learning_rate": 0.019675366103917284, + "loss": 3.487, + "mean_token_accuracy": 0.336663156747818, + "num_tokens": 1070149143.0, + "step": 2094 + }, + { + "epoch": 0.5665224445646295, + "grad_norm": 3.90625, + "learning_rate": 0.019674946376475568, + "loss": 3.4984, + "mean_token_accuracy": 0.3701956868171692, + "num_tokens": 1070673212.0, + "step": 2095 + }, + { + "epoch": 0.5667928610059492, + "grad_norm": 3.609375, + "learning_rate": 0.019674526382860987, + "loss": 3.2253, + "mean_token_accuracy": 0.38736405968666077, + "num_tokens": 1071179900.0, + "step": 2096 + }, + { + "epoch": 0.5670632774472688, + "grad_norm": 2.859375, + "learning_rate": 0.019674106123086427, + "loss": 3.3349, + "mean_token_accuracy": 0.3396061360836029, + "num_tokens": 1071703980.0, + "step": 2097 + }, + { + "epoch": 0.5673336938885885, + "grad_norm": 2.28125, + "learning_rate": 0.019673685597164783, + "loss": 3.3656, + "mean_token_accuracy": 0.391640841960907, + "num_tokens": 1072166138.0, + "step": 2098 + }, + { + "epoch": 0.5676041103299081, + "grad_norm": 2.28125, + "learning_rate": 0.01967326480510895, + "loss": 3.3095, + "mean_token_accuracy": 0.3795500099658966, + "num_tokens": 1072690412.0, + "step": 2099 + }, + { + "epoch": 0.5678745267712277, + "grad_norm": 2.734375, + "learning_rate": 0.019672843746931856, + "loss": 2.9374, + "mean_token_accuracy": 0.41553324460983276, + "num_tokens": 1073166986.0, + "step": 2100 + }, + { + "epoch": 0.5681449432125473, + "grad_norm": 4.53125, + "learning_rate": 0.01967242242264641, + "loss": 3.5834, + "mean_token_accuracy": 0.35907745361328125, + "num_tokens": 1073653102.0, + "step": 2101 + }, + { + "epoch": 0.5684153596538669, + "grad_norm": 2.875, + "learning_rate": 0.019672000832265536, + "loss": 3.3615, + "mean_token_accuracy": 0.3761073350906372, + "num_tokens": 1074152792.0, + "step": 2102 + }, + { + "epoch": 0.5686857760951866, + "grad_norm": 3.421875, + "learning_rate": 0.019671578975802174, + "loss": 3.571, + "mean_token_accuracy": 0.3546082675457001, + "num_tokens": 1074676970.0, + "step": 2103 + }, + { + "epoch": 0.5689561925365062, + "grad_norm": 3.109375, + "learning_rate": 0.019671156853269266, + "loss": 3.5855, + "mean_token_accuracy": 0.3531642556190491, + "num_tokens": 1075201189.0, + "step": 2104 + }, + { + "epoch": 0.5692266089778258, + "grad_norm": 3.515625, + "learning_rate": 0.019670734464679773, + "loss": 3.2411, + "mean_token_accuracy": 0.3805479407310486, + "num_tokens": 1075725465.0, + "step": 2105 + }, + { + "epoch": 0.5694970254191455, + "grad_norm": 2.890625, + "learning_rate": 0.01967031181004664, + "loss": 3.4265, + "mean_token_accuracy": 0.3691132068634033, + "num_tokens": 1076249599.0, + "step": 2106 + }, + { + "epoch": 0.5697674418604651, + "grad_norm": 4.21875, + "learning_rate": 0.019669888889382845, + "loss": 3.607, + "mean_token_accuracy": 0.34453919529914856, + "num_tokens": 1076773737.0, + "step": 2107 + }, + { + "epoch": 0.5700378583017848, + "grad_norm": 2.484375, + "learning_rate": 0.019669465702701363, + "loss": 3.3091, + "mean_token_accuracy": 0.3771371841430664, + "num_tokens": 1077286053.0, + "step": 2108 + }, + { + "epoch": 0.5703082747431044, + "grad_norm": 3.15625, + "learning_rate": 0.01966904225001518, + "loss": 3.3359, + "mean_token_accuracy": 0.36515647172927856, + "num_tokens": 1077810264.0, + "step": 2109 + }, + { + "epoch": 0.5705786911844241, + "grad_norm": 2.46875, + "learning_rate": 0.019668618531337282, + "loss": 3.448, + "mean_token_accuracy": 0.37964820861816406, + "num_tokens": 1078334438.0, + "step": 2110 + }, + { + "epoch": 0.5708491076257436, + "grad_norm": 1336.0, + "learning_rate": 0.019668194546680676, + "loss": 33.9112, + "mean_token_accuracy": 0.0, + "num_tokens": 1078812288.0, + "step": 2111 + }, + { + "epoch": 0.5711195240670632, + "grad_norm": 8.9375, + "learning_rate": 0.019667770296058373, + "loss": 4.3138, + "mean_token_accuracy": 0.2624777555465698, + "num_tokens": 1079336415.0, + "step": 2112 + }, + { + "epoch": 0.5713899405083829, + "grad_norm": 3.515625, + "learning_rate": 0.019667345779483387, + "loss": 3.6898, + "mean_token_accuracy": 0.3455446660518646, + "num_tokens": 1079838808.0, + "step": 2113 + }, + { + "epoch": 0.5716603569497025, + "grad_norm": 4.46875, + "learning_rate": 0.01966692099696874, + "loss": 3.5196, + "mean_token_accuracy": 0.3660162687301636, + "num_tokens": 1080362899.0, + "step": 2114 + }, + { + "epoch": 0.5719307733910222, + "grad_norm": 2.84375, + "learning_rate": 0.01966649594852747, + "loss": 3.755, + "mean_token_accuracy": 0.33295345306396484, + "num_tokens": 1080867077.0, + "step": 2115 + }, + { + "epoch": 0.5722011898323418, + "grad_norm": 3.171875, + "learning_rate": 0.01966607063417262, + "loss": 3.578, + "mean_token_accuracy": 0.3460514545440674, + "num_tokens": 1081376230.0, + "step": 2116 + }, + { + "epoch": 0.5724716062736614, + "grad_norm": 3.453125, + "learning_rate": 0.019665645053917238, + "loss": 3.4026, + "mean_token_accuracy": 0.3935048580169678, + "num_tokens": 1081886836.0, + "step": 2117 + }, + { + "epoch": 0.5727420227149811, + "grad_norm": 3.375, + "learning_rate": 0.019665219207774382, + "loss": 3.6338, + "mean_token_accuracy": 0.3493550717830658, + "num_tokens": 1082411019.0, + "step": 2118 + }, + { + "epoch": 0.5730124391563007, + "grad_norm": 2.4375, + "learning_rate": 0.019664793095757114, + "loss": 3.4043, + "mean_token_accuracy": 0.38876044750213623, + "num_tokens": 1082898311.0, + "step": 2119 + }, + { + "epoch": 0.5732828555976204, + "grad_norm": 2.90625, + "learning_rate": 0.019664366717878513, + "loss": 3.4468, + "mean_token_accuracy": 0.3683164119720459, + "num_tokens": 1083422582.0, + "step": 2120 + }, + { + "epoch": 0.57355327203894, + "grad_norm": 2.59375, + "learning_rate": 0.019663940074151662, + "loss": 3.3932, + "mean_token_accuracy": 0.3658859133720398, + "num_tokens": 1083934843.0, + "step": 2121 + }, + { + "epoch": 0.5738236884802596, + "grad_norm": 2.375, + "learning_rate": 0.01966351316458965, + "loss": 3.3778, + "mean_token_accuracy": 0.36664000153541565, + "num_tokens": 1084401341.0, + "step": 2122 + }, + { + "epoch": 0.5740941049215792, + "grad_norm": 2.96875, + "learning_rate": 0.01966308598920558, + "loss": 3.1481, + "mean_token_accuracy": 0.3764857053756714, + "num_tokens": 1084925553.0, + "step": 2123 + }, + { + "epoch": 0.5743645213628988, + "grad_norm": 3.6875, + "learning_rate": 0.01966265854801255, + "loss": 3.5288, + "mean_token_accuracy": 0.3638278543949127, + "num_tokens": 1085442438.0, + "step": 2124 + }, + { + "epoch": 0.5746349378042185, + "grad_norm": 2.765625, + "learning_rate": 0.019662230841023684, + "loss": 3.5488, + "mean_token_accuracy": 0.360626757144928, + "num_tokens": 1085966719.0, + "step": 2125 + }, + { + "epoch": 0.5749053542455381, + "grad_norm": 2.6875, + "learning_rate": 0.019661802868252105, + "loss": 3.2739, + "mean_token_accuracy": 0.37019383907318115, + "num_tokens": 1086490313.0, + "step": 2126 + }, + { + "epoch": 0.5751757706868578, + "grad_norm": 2.5, + "learning_rate": 0.019661374629710935, + "loss": 3.2825, + "mean_token_accuracy": 0.37396395206451416, + "num_tokens": 1087014597.0, + "step": 2127 + }, + { + "epoch": 0.5754461871281774, + "grad_norm": 2.65625, + "learning_rate": 0.01966094612541332, + "loss": 3.2038, + "mean_token_accuracy": 0.3815508484840393, + "num_tokens": 1087538812.0, + "step": 2128 + }, + { + "epoch": 0.575716603569497, + "grad_norm": 3.140625, + "learning_rate": 0.01966051735537241, + "loss": 3.4197, + "mean_token_accuracy": 0.38871899247169495, + "num_tokens": 1088062936.0, + "step": 2129 + }, + { + "epoch": 0.5759870200108167, + "grad_norm": 2.40625, + "learning_rate": 0.01966008831960136, + "loss": 3.2687, + "mean_token_accuracy": 0.4128251075744629, + "num_tokens": 1088488611.0, + "step": 2130 + }, + { + "epoch": 0.5762574364521363, + "grad_norm": 179.0, + "learning_rate": 0.019659659018113333, + "loss": 12.799, + "mean_token_accuracy": 0.003090922487899661, + "num_tokens": 1089012802.0, + "step": 2131 + }, + { + "epoch": 0.5765278528934559, + "grad_norm": 7.28125, + "learning_rate": 0.019659229450921495, + "loss": 4.1713, + "mean_token_accuracy": 0.2780880331993103, + "num_tokens": 1089536926.0, + "step": 2132 + }, + { + "epoch": 0.5767982693347755, + "grad_norm": 2.390625, + "learning_rate": 0.019658799618039038, + "loss": 3.4961, + "mean_token_accuracy": 0.35587430000305176, + "num_tokens": 1090061203.0, + "step": 2133 + }, + { + "epoch": 0.5770686857760952, + "grad_norm": 2.734375, + "learning_rate": 0.019658369519479142, + "loss": 3.3186, + "mean_token_accuracy": 0.37439200282096863, + "num_tokens": 1090585423.0, + "step": 2134 + }, + { + "epoch": 0.5773391022174148, + "grad_norm": 3.296875, + "learning_rate": 0.019657939155255006, + "loss": 3.3256, + "mean_token_accuracy": 0.3721519708633423, + "num_tokens": 1091060720.0, + "step": 2135 + }, + { + "epoch": 0.5776095186587344, + "grad_norm": 3.046875, + "learning_rate": 0.01965750852537984, + "loss": 3.3846, + "mean_token_accuracy": 0.3711397051811218, + "num_tokens": 1091584870.0, + "step": 2136 + }, + { + "epoch": 0.5778799351000541, + "grad_norm": 2.9375, + "learning_rate": 0.01965707762986685, + "loss": 3.6078, + "mean_token_accuracy": 0.3424816429615021, + "num_tokens": 1092084543.0, + "step": 2137 + }, + { + "epoch": 0.5781503515413737, + "grad_norm": 2.515625, + "learning_rate": 0.01965664646872926, + "loss": 3.3665, + "mean_token_accuracy": 0.38100799918174744, + "num_tokens": 1092608542.0, + "step": 2138 + }, + { + "epoch": 0.5784207679826934, + "grad_norm": 2.921875, + "learning_rate": 0.019656215041980295, + "loss": 3.6156, + "mean_token_accuracy": 0.3499712347984314, + "num_tokens": 1093132817.0, + "step": 2139 + }, + { + "epoch": 0.578691184424013, + "grad_norm": 3.046875, + "learning_rate": 0.0196557833496332, + "loss": 3.3082, + "mean_token_accuracy": 0.36916860938072205, + "num_tokens": 1093637909.0, + "step": 2140 + }, + { + "epoch": 0.5789616008653327, + "grad_norm": 2.59375, + "learning_rate": 0.019655351391701218, + "loss": 3.3828, + "mean_token_accuracy": 0.38479626178741455, + "num_tokens": 1094156554.0, + "step": 2141 + }, + { + "epoch": 0.5792320173066522, + "grad_norm": 3.3125, + "learning_rate": 0.019654919168197598, + "loss": 3.4032, + "mean_token_accuracy": 0.3687770962715149, + "num_tokens": 1094680833.0, + "step": 2142 + }, + { + "epoch": 0.5795024337479718, + "grad_norm": 3.421875, + "learning_rate": 0.019654486679135608, + "loss": 3.4878, + "mean_token_accuracy": 0.3666474223136902, + "num_tokens": 1095205027.0, + "step": 2143 + }, + { + "epoch": 0.5797728501892915, + "grad_norm": 3.046875, + "learning_rate": 0.019654053924528514, + "loss": 3.5573, + "mean_token_accuracy": 0.3863561153411865, + "num_tokens": 1095636000.0, + "step": 2144 + }, + { + "epoch": 0.5800432666306111, + "grad_norm": 4.03125, + "learning_rate": 0.019653620904389598, + "loss": 3.6298, + "mean_token_accuracy": 0.3437790870666504, + "num_tokens": 1096156148.0, + "step": 2145 + }, + { + "epoch": 0.5803136830719308, + "grad_norm": 2.765625, + "learning_rate": 0.01965318761873214, + "loss": 3.4224, + "mean_token_accuracy": 0.39716070890426636, + "num_tokens": 1096616907.0, + "step": 2146 + }, + { + "epoch": 0.5805840995132504, + "grad_norm": 3.609375, + "learning_rate": 0.01965275406756944, + "loss": 3.3908, + "mean_token_accuracy": 0.37427017092704773, + "num_tokens": 1097141176.0, + "step": 2147 + }, + { + "epoch": 0.58085451595457, + "grad_norm": 3.078125, + "learning_rate": 0.0196523202509148, + "loss": 3.4326, + "mean_token_accuracy": 0.38101375102996826, + "num_tokens": 1097665433.0, + "step": 2148 + }, + { + "epoch": 0.5811249323958897, + "grad_norm": 3.75, + "learning_rate": 0.01965188616878153, + "loss": 3.2417, + "mean_token_accuracy": 0.37118667364120483, + "num_tokens": 1098189701.0, + "step": 2149 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 3.3125, + "learning_rate": 0.01965145182118295, + "loss": 3.4302, + "mean_token_accuracy": 0.35051196813583374, + "num_tokens": 1098713814.0, + "step": 2150 + }, + { + "epoch": 0.581665765278529, + "grad_norm": 60.5, + "learning_rate": 0.019651017208132383, + "loss": 11.9075, + "mean_token_accuracy": 0.025477692484855652, + "num_tokens": 1099205246.0, + "step": 2151 + }, + { + "epoch": 0.5819361817198486, + "grad_norm": 5.78125, + "learning_rate": 0.019650582329643167, + "loss": 3.8495, + "mean_token_accuracy": 0.29546988010406494, + "num_tokens": 1099729459.0, + "step": 2152 + }, + { + "epoch": 0.5822065981611682, + "grad_norm": 2.34375, + "learning_rate": 0.019650147185728647, + "loss": 3.5341, + "mean_token_accuracy": 0.35271933674812317, + "num_tokens": 1100253734.0, + "step": 2153 + }, + { + "epoch": 0.5824770146024878, + "grad_norm": 3.4375, + "learning_rate": 0.019649711776402172, + "loss": 3.475, + "mean_token_accuracy": 0.34770524501800537, + "num_tokens": 1100777886.0, + "step": 2154 + }, + { + "epoch": 0.5827474310438074, + "grad_norm": 3.015625, + "learning_rate": 0.019649276101677104, + "loss": 3.5384, + "mean_token_accuracy": 0.36256909370422363, + "num_tokens": 1101302113.0, + "step": 2155 + }, + { + "epoch": 0.5830178474851271, + "grad_norm": 4.53125, + "learning_rate": 0.01964884016156681, + "loss": 3.7585, + "mean_token_accuracy": 0.3528192639350891, + "num_tokens": 1101826395.0, + "step": 2156 + }, + { + "epoch": 0.5832882639264467, + "grad_norm": 4.0, + "learning_rate": 0.019648403956084665, + "loss": 3.5523, + "mean_token_accuracy": 0.3535737991333008, + "num_tokens": 1102313466.0, + "step": 2157 + }, + { + "epoch": 0.5835586803677664, + "grad_norm": 3.4375, + "learning_rate": 0.01964796748524405, + "loss": 3.5763, + "mean_token_accuracy": 0.38870832324028015, + "num_tokens": 1102784915.0, + "step": 2158 + }, + { + "epoch": 0.583829096809086, + "grad_norm": 3.65625, + "learning_rate": 0.019647530749058364, + "loss": 3.3224, + "mean_token_accuracy": 0.3531089127063751, + "num_tokens": 1103309188.0, + "step": 2159 + }, + { + "epoch": 0.5840995132504057, + "grad_norm": 2.296875, + "learning_rate": 0.019647093747541002, + "loss": 3.4284, + "mean_token_accuracy": 0.37186163663864136, + "num_tokens": 1103833451.0, + "step": 2160 + }, + { + "epoch": 0.5843699296917253, + "grad_norm": 2.96875, + "learning_rate": 0.019646656480705373, + "loss": 3.5169, + "mean_token_accuracy": 0.3610597252845764, + "num_tokens": 1104307192.0, + "step": 2161 + }, + { + "epoch": 0.5846403461330449, + "grad_norm": 2.859375, + "learning_rate": 0.019646218948564897, + "loss": 3.502, + "mean_token_accuracy": 0.3734223246574402, + "num_tokens": 1104831367.0, + "step": 2162 + }, + { + "epoch": 0.5849107625743645, + "grad_norm": 3.53125, + "learning_rate": 0.019645781151133, + "loss": 3.5032, + "mean_token_accuracy": 0.33493590354919434, + "num_tokens": 1105355527.0, + "step": 2163 + }, + { + "epoch": 0.5851811790156841, + "grad_norm": 2.4375, + "learning_rate": 0.019645343088423108, + "loss": 3.1841, + "mean_token_accuracy": 0.3887735903263092, + "num_tokens": 1105879627.0, + "step": 2164 + }, + { + "epoch": 0.5854515954570038, + "grad_norm": 2.859375, + "learning_rate": 0.019644904760448664, + "loss": 3.356, + "mean_token_accuracy": 0.35806789994239807, + "num_tokens": 1106403873.0, + "step": 2165 + }, + { + "epoch": 0.5857220118983234, + "grad_norm": 2.265625, + "learning_rate": 0.01964446616722312, + "loss": 3.4349, + "mean_token_accuracy": 0.33283329010009766, + "num_tokens": 1106928088.0, + "step": 2166 + }, + { + "epoch": 0.585992428339643, + "grad_norm": 3.53125, + "learning_rate": 0.019644027308759936, + "loss": 3.2014, + "mean_token_accuracy": 0.3631289601325989, + "num_tokens": 1107452213.0, + "step": 2167 + }, + { + "epoch": 0.5862628447809627, + "grad_norm": 2.703125, + "learning_rate": 0.019643588185072572, + "loss": 3.3564, + "mean_token_accuracy": 0.3830299973487854, + "num_tokens": 1107976483.0, + "step": 2168 + }, + { + "epoch": 0.5865332612222823, + "grad_norm": 3.140625, + "learning_rate": 0.0196431487961745, + "loss": 3.3073, + "mean_token_accuracy": 0.38511377573013306, + "num_tokens": 1108500759.0, + "step": 2169 + }, + { + "epoch": 0.586803677663602, + "grad_norm": 2.421875, + "learning_rate": 0.019642709142079207, + "loss": 3.3066, + "mean_token_accuracy": 0.4136609435081482, + "num_tokens": 1108961211.0, + "step": 2170 + }, + { + "epoch": 0.5870740941049216, + "grad_norm": 194.0, + "learning_rate": 0.01964226922280018, + "loss": 18.7036, + "mean_token_accuracy": 0.011066791601479053, + "num_tokens": 1109421021.0, + "step": 2171 + }, + { + "epoch": 0.5873445105462413, + "grad_norm": 7.625, + "learning_rate": 0.019641829038350926, + "loss": 4.0453, + "mean_token_accuracy": 0.3023015856742859, + "num_tokens": 1109945263.0, + "step": 2172 + }, + { + "epoch": 0.5876149269875608, + "grad_norm": 3.171875, + "learning_rate": 0.019641388588744934, + "loss": 3.3868, + "mean_token_accuracy": 0.3649759292602539, + "num_tokens": 1110409352.0, + "step": 2173 + }, + { + "epoch": 0.5878853434288804, + "grad_norm": 4.0625, + "learning_rate": 0.01964094787399573, + "loss": 3.2977, + "mean_token_accuracy": 0.3751930594444275, + "num_tokens": 1110933583.0, + "step": 2174 + }, + { + "epoch": 0.5881557598702001, + "grad_norm": 2.859375, + "learning_rate": 0.01964050689411683, + "loss": 3.4382, + "mean_token_accuracy": 0.38073495030403137, + "num_tokens": 1111388814.0, + "step": 2175 + }, + { + "epoch": 0.5884261763115197, + "grad_norm": 3.421875, + "learning_rate": 0.019640065649121775, + "loss": 3.4573, + "mean_token_accuracy": 0.3802885115146637, + "num_tokens": 1111889516.0, + "step": 2176 + }, + { + "epoch": 0.5886965927528394, + "grad_norm": 2.875, + "learning_rate": 0.019639624139024096, + "loss": 3.415, + "mean_token_accuracy": 0.3780483901500702, + "num_tokens": 1112380151.0, + "step": 2177 + }, + { + "epoch": 0.588967009194159, + "grad_norm": 2.65625, + "learning_rate": 0.019639182363837337, + "loss": 3.4973, + "mean_token_accuracy": 0.3670310378074646, + "num_tokens": 1112904282.0, + "step": 2178 + }, + { + "epoch": 0.5892374256354787, + "grad_norm": 3.171875, + "learning_rate": 0.01963874032357506, + "loss": 3.3493, + "mean_token_accuracy": 0.39737290143966675, + "num_tokens": 1113367310.0, + "step": 2179 + }, + { + "epoch": 0.5895078420767983, + "grad_norm": 3.375, + "learning_rate": 0.019638298018250823, + "loss": 3.303, + "mean_token_accuracy": 0.34648630023002625, + "num_tokens": 1113881509.0, + "step": 2180 + }, + { + "epoch": 0.5897782585181179, + "grad_norm": 2.96875, + "learning_rate": 0.019637855447878202, + "loss": 3.3846, + "mean_token_accuracy": 0.3656599521636963, + "num_tokens": 1114405725.0, + "step": 2181 + }, + { + "epoch": 0.5900486749594376, + "grad_norm": 3.1875, + "learning_rate": 0.019637412612470772, + "loss": 3.4752, + "mean_token_accuracy": 0.37200021743774414, + "num_tokens": 1114930007.0, + "step": 2182 + }, + { + "epoch": 0.5903190914007571, + "grad_norm": 3.0625, + "learning_rate": 0.01963696951204212, + "loss": 3.2413, + "mean_token_accuracy": 0.39686280488967896, + "num_tokens": 1115418365.0, + "step": 2183 + }, + { + "epoch": 0.5905895078420768, + "grad_norm": 3.578125, + "learning_rate": 0.01963652614660585, + "loss": 3.4126, + "mean_token_accuracy": 0.372814416885376, + "num_tokens": 1115933691.0, + "step": 2184 + }, + { + "epoch": 0.5908599242833964, + "grad_norm": 3.8125, + "learning_rate": 0.019636082516175556, + "loss": 3.6609, + "mean_token_accuracy": 0.36641785502433777, + "num_tokens": 1116451988.0, + "step": 2185 + }, + { + "epoch": 0.591130340724716, + "grad_norm": 2.5625, + "learning_rate": 0.01963563862076485, + "loss": 3.4159, + "mean_token_accuracy": 0.377379834651947, + "num_tokens": 1116976217.0, + "step": 2186 + }, + { + "epoch": 0.5914007571660357, + "grad_norm": 2.421875, + "learning_rate": 0.01963519446038736, + "loss": 3.2451, + "mean_token_accuracy": 0.3670634925365448, + "num_tokens": 1117500194.0, + "step": 2187 + }, + { + "epoch": 0.5916711736073553, + "grad_norm": 2.890625, + "learning_rate": 0.019634750035056707, + "loss": 3.2629, + "mean_token_accuracy": 0.3630295991897583, + "num_tokens": 1118024304.0, + "step": 2188 + }, + { + "epoch": 0.591941590048675, + "grad_norm": 3.484375, + "learning_rate": 0.01963430534478653, + "loss": 3.2759, + "mean_token_accuracy": 0.3768152892589569, + "num_tokens": 1118548497.0, + "step": 2189 + }, + { + "epoch": 0.5922120064899946, + "grad_norm": 3.0625, + "learning_rate": 0.019633860389590474, + "loss": 3.1597, + "mean_token_accuracy": 0.38329580426216125, + "num_tokens": 1119042561.0, + "step": 2190 + }, + { + "epoch": 0.5924824229313143, + "grad_norm": 133.0, + "learning_rate": 0.01963341516948219, + "loss": 12.231, + "mean_token_accuracy": 0.004626275040209293, + "num_tokens": 1119566743.0, + "step": 2191 + }, + { + "epoch": 0.5927528393726339, + "grad_norm": 8.875, + "learning_rate": 0.019632969684475338, + "loss": 4.3273, + "mean_token_accuracy": 0.2941206693649292, + "num_tokens": 1120054759.0, + "step": 2192 + }, + { + "epoch": 0.5930232558139535, + "grad_norm": 2.34375, + "learning_rate": 0.019632523934583593, + "loss": 3.6151, + "mean_token_accuracy": 0.33893725275993347, + "num_tokens": 1120579018.0, + "step": 2193 + }, + { + "epoch": 0.5932936722552731, + "grad_norm": 2.5625, + "learning_rate": 0.019632077919820624, + "loss": 3.5388, + "mean_token_accuracy": 0.3680573105812073, + "num_tokens": 1121102621.0, + "step": 2194 + }, + { + "epoch": 0.5935640886965927, + "grad_norm": 3.046875, + "learning_rate": 0.019631631640200115, + "loss": 3.4882, + "mean_token_accuracy": 0.362493634223938, + "num_tokens": 1121626872.0, + "step": 2195 + }, + { + "epoch": 0.5938345051379124, + "grad_norm": 2.859375, + "learning_rate": 0.01963118509573577, + "loss": 3.1843, + "mean_token_accuracy": 0.3895723223686218, + "num_tokens": 1122151142.0, + "step": 2196 + }, + { + "epoch": 0.594104921579232, + "grad_norm": 2.625, + "learning_rate": 0.019630738286441282, + "loss": 3.1498, + "mean_token_accuracy": 0.39602625370025635, + "num_tokens": 1122675241.0, + "step": 2197 + }, + { + "epoch": 0.5943753380205516, + "grad_norm": 2.59375, + "learning_rate": 0.01963029121233036, + "loss": 3.3428, + "mean_token_accuracy": 0.38280609250068665, + "num_tokens": 1123199498.0, + "step": 2198 + }, + { + "epoch": 0.5946457544618713, + "grad_norm": 5.96875, + "learning_rate": 0.019629843873416725, + "loss": 3.4047, + "mean_token_accuracy": 0.37359029054641724, + "num_tokens": 1123689134.0, + "step": 2199 + }, + { + "epoch": 0.5949161709031909, + "grad_norm": 3.390625, + "learning_rate": 0.0196293962697141, + "loss": 3.3629, + "mean_token_accuracy": 0.3580431640148163, + "num_tokens": 1124176044.0, + "step": 2200 + }, + { + "epoch": 0.5951865873445106, + "grad_norm": 3.5625, + "learning_rate": 0.019628948401236218, + "loss": 3.4718, + "mean_token_accuracy": 0.3725981116294861, + "num_tokens": 1124700319.0, + "step": 2201 + }, + { + "epoch": 0.5954570037858302, + "grad_norm": 3.484375, + "learning_rate": 0.01962850026799683, + "loss": 3.3254, + "mean_token_accuracy": 0.3470418155193329, + "num_tokens": 1125224422.0, + "step": 2202 + }, + { + "epoch": 0.5957274202271499, + "grad_norm": 3.203125, + "learning_rate": 0.019628051870009672, + "loss": 3.3803, + "mean_token_accuracy": 0.3751794993877411, + "num_tokens": 1125748701.0, + "step": 2203 + }, + { + "epoch": 0.5959978366684694, + "grad_norm": 3.5, + "learning_rate": 0.019627603207288513, + "loss": 3.4558, + "mean_token_accuracy": 0.3628745675086975, + "num_tokens": 1126272841.0, + "step": 2204 + }, + { + "epoch": 0.596268253109789, + "grad_norm": 2.359375, + "learning_rate": 0.019627154279847115, + "loss": 3.3198, + "mean_token_accuracy": 0.36028504371643066, + "num_tokens": 1126797025.0, + "step": 2205 + }, + { + "epoch": 0.5965386695511087, + "grad_norm": 3.140625, + "learning_rate": 0.019626705087699252, + "loss": 3.6364, + "mean_token_accuracy": 0.3493250608444214, + "num_tokens": 1127321286.0, + "step": 2206 + }, + { + "epoch": 0.5968090859924283, + "grad_norm": 3.265625, + "learning_rate": 0.019626255630858708, + "loss": 3.4348, + "mean_token_accuracy": 0.3723739981651306, + "num_tokens": 1127816421.0, + "step": 2207 + }, + { + "epoch": 0.597079502433748, + "grad_norm": 2.984375, + "learning_rate": 0.019625805909339274, + "loss": 3.3827, + "mean_token_accuracy": 0.3679787218570709, + "num_tokens": 1128340601.0, + "step": 2208 + }, + { + "epoch": 0.5973499188750676, + "grad_norm": 3.03125, + "learning_rate": 0.019625355923154744, + "loss": 3.4195, + "mean_token_accuracy": 0.35531505942344666, + "num_tokens": 1128864758.0, + "step": 2209 + }, + { + "epoch": 0.5976203353163873, + "grad_norm": 2.609375, + "learning_rate": 0.01962490567231893, + "loss": 3.2835, + "mean_token_accuracy": 0.3818522095680237, + "num_tokens": 1129389038.0, + "step": 2210 + }, + { + "epoch": 0.5978907517577069, + "grad_norm": 103.0, + "learning_rate": 0.01962445515684565, + "loss": 13.3483, + "mean_token_accuracy": 0.005431059747934341, + "num_tokens": 1129913239.0, + "step": 2211 + }, + { + "epoch": 0.5981611681990265, + "grad_norm": 6.78125, + "learning_rate": 0.01962400437674872, + "loss": 3.8784, + "mean_token_accuracy": 0.2686183452606201, + "num_tokens": 1130393778.0, + "step": 2212 + }, + { + "epoch": 0.5984315846403462, + "grad_norm": 2.6875, + "learning_rate": 0.019623553332041976, + "loss": 3.5051, + "mean_token_accuracy": 0.34121161699295044, + "num_tokens": 1130918026.0, + "step": 2213 + }, + { + "epoch": 0.5987020010816657, + "grad_norm": 3.8125, + "learning_rate": 0.019623102022739255, + "loss": 3.4392, + "mean_token_accuracy": 0.4027640223503113, + "num_tokens": 1131363575.0, + "step": 2214 + }, + { + "epoch": 0.5989724175229854, + "grad_norm": 2.3125, + "learning_rate": 0.0196226504488544, + "loss": 3.3361, + "mean_token_accuracy": 0.3839018940925598, + "num_tokens": 1131887748.0, + "step": 2215 + }, + { + "epoch": 0.599242833964305, + "grad_norm": 2.96875, + "learning_rate": 0.019622198610401275, + "loss": 3.4813, + "mean_token_accuracy": 0.35319948196411133, + "num_tokens": 1132411957.0, + "step": 2216 + }, + { + "epoch": 0.5995132504056246, + "grad_norm": 2.125, + "learning_rate": 0.019621746507393744, + "loss": 3.4725, + "mean_token_accuracy": 0.37390220165252686, + "num_tokens": 1132902341.0, + "step": 2217 + }, + { + "epoch": 0.5997836668469443, + "grad_norm": 2.546875, + "learning_rate": 0.01962129413984567, + "loss": 3.3981, + "mean_token_accuracy": 0.35985761880874634, + "num_tokens": 1133426567.0, + "step": 2218 + }, + { + "epoch": 0.6000540832882639, + "grad_norm": 2.703125, + "learning_rate": 0.019620841507770943, + "loss": 3.3548, + "mean_token_accuracy": 0.3964836597442627, + "num_tokens": 1133889846.0, + "step": 2219 + }, + { + "epoch": 0.6003244997295836, + "grad_norm": 3.140625, + "learning_rate": 0.019620388611183443, + "loss": 3.4567, + "mean_token_accuracy": 0.3641844391822815, + "num_tokens": 1134373479.0, + "step": 2220 + }, + { + "epoch": 0.6005949161709032, + "grad_norm": 3.3125, + "learning_rate": 0.019619935450097072, + "loss": 3.5327, + "mean_token_accuracy": 0.3655785322189331, + "num_tokens": 1134897713.0, + "step": 2221 + }, + { + "epoch": 0.6008653326122229, + "grad_norm": 2.484375, + "learning_rate": 0.01961948202452573, + "loss": 3.1609, + "mean_token_accuracy": 0.40079227089881897, + "num_tokens": 1135421914.0, + "step": 2222 + }, + { + "epoch": 0.6011357490535425, + "grad_norm": 3.09375, + "learning_rate": 0.01961902833448333, + "loss": 3.1352, + "mean_token_accuracy": 0.35404831171035767, + "num_tokens": 1135946125.0, + "step": 2223 + }, + { + "epoch": 0.601406165494862, + "grad_norm": 2.921875, + "learning_rate": 0.019618574379983794, + "loss": 3.2577, + "mean_token_accuracy": 0.37661775946617126, + "num_tokens": 1136470374.0, + "step": 2224 + }, + { + "epoch": 0.6016765819361817, + "grad_norm": 4.15625, + "learning_rate": 0.01961812016104105, + "loss": 3.3131, + "mean_token_accuracy": 0.3763008117675781, + "num_tokens": 1136994654.0, + "step": 2225 + }, + { + "epoch": 0.6019469983775013, + "grad_norm": 2.890625, + "learning_rate": 0.01961766567766904, + "loss": 3.199, + "mean_token_accuracy": 0.3885895609855652, + "num_tokens": 1137518829.0, + "step": 2226 + }, + { + "epoch": 0.602217414818821, + "grad_norm": 2.65625, + "learning_rate": 0.0196172109298817, + "loss": 3.1079, + "mean_token_accuracy": 0.38509035110473633, + "num_tokens": 1138043014.0, + "step": 2227 + }, + { + "epoch": 0.6024878312601406, + "grad_norm": 3.359375, + "learning_rate": 0.019616755917692987, + "loss": 3.4435, + "mean_token_accuracy": 0.37947165966033936, + "num_tokens": 1138567222.0, + "step": 2228 + }, + { + "epoch": 0.6027582477014602, + "grad_norm": 2.96875, + "learning_rate": 0.019616300641116864, + "loss": 3.2639, + "mean_token_accuracy": 0.3734496235847473, + "num_tokens": 1139091463.0, + "step": 2229 + }, + { + "epoch": 0.6030286641427799, + "grad_norm": 3.21875, + "learning_rate": 0.019615845100167298, + "loss": 3.5528, + "mean_token_accuracy": 0.3657757639884949, + "num_tokens": 1139615721.0, + "step": 2230 + }, + { + "epoch": 0.6032990805840995, + "grad_norm": 78.0, + "learning_rate": 0.019615389294858267, + "loss": 11.1342, + "mean_token_accuracy": 0.004550575744360685, + "num_tokens": 1140084108.0, + "step": 2231 + }, + { + "epoch": 0.6035694970254192, + "grad_norm": 6.9375, + "learning_rate": 0.019614933225203753, + "loss": 4.1581, + "mean_token_accuracy": 0.27379846572875977, + "num_tokens": 1140584771.0, + "step": 2232 + }, + { + "epoch": 0.6038399134667388, + "grad_norm": 3.109375, + "learning_rate": 0.019614476891217755, + "loss": 3.6788, + "mean_token_accuracy": 0.35655367374420166, + "num_tokens": 1141108981.0, + "step": 2233 + }, + { + "epoch": 0.6041103299080585, + "grad_norm": 3.796875, + "learning_rate": 0.019614020292914276, + "loss": 3.5756, + "mean_token_accuracy": 0.3574933409690857, + "num_tokens": 1141633138.0, + "step": 2234 + }, + { + "epoch": 0.604380746349378, + "grad_norm": 4.8125, + "learning_rate": 0.019613563430307318, + "loss": 3.0431, + "mean_token_accuracy": 0.3990272283554077, + "num_tokens": 1142121298.0, + "step": 2235 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 2.84375, + "learning_rate": 0.019613106303410902, + "loss": 3.5169, + "mean_token_accuracy": 0.3606972098350525, + "num_tokens": 1142645368.0, + "step": 2236 + }, + { + "epoch": 0.6049215792320173, + "grad_norm": 4.15625, + "learning_rate": 0.01961264891223906, + "loss": 3.5327, + "mean_token_accuracy": 0.35339322686195374, + "num_tokens": 1143169648.0, + "step": 2237 + }, + { + "epoch": 0.6051919956733369, + "grad_norm": 2.671875, + "learning_rate": 0.019612191256805817, + "loss": 3.3304, + "mean_token_accuracy": 0.3701842129230499, + "num_tokens": 1143682898.0, + "step": 2238 + }, + { + "epoch": 0.6054624121146566, + "grad_norm": 3.390625, + "learning_rate": 0.019611733337125223, + "loss": 3.6049, + "mean_token_accuracy": 0.35214462876319885, + "num_tokens": 1144207143.0, + "step": 2239 + }, + { + "epoch": 0.6057328285559762, + "grad_norm": 2.921875, + "learning_rate": 0.01961127515321132, + "loss": 3.5308, + "mean_token_accuracy": 0.37613850831985474, + "num_tokens": 1144701227.0, + "step": 2240 + }, + { + "epoch": 0.6060032449972959, + "grad_norm": 4.8125, + "learning_rate": 0.019610816705078173, + "loss": 3.3211, + "mean_token_accuracy": 0.35777604579925537, + "num_tokens": 1145225476.0, + "step": 2241 + }, + { + "epoch": 0.6062736614386155, + "grad_norm": 2.609375, + "learning_rate": 0.01961035799273985, + "loss": 3.5411, + "mean_token_accuracy": 0.3621812164783478, + "num_tokens": 1145749695.0, + "step": 2242 + }, + { + "epoch": 0.6065440778799351, + "grad_norm": 3.171875, + "learning_rate": 0.019609899016210422, + "loss": 3.6, + "mean_token_accuracy": 0.36679497361183167, + "num_tokens": 1146273868.0, + "step": 2243 + }, + { + "epoch": 0.6068144943212548, + "grad_norm": 2.875, + "learning_rate": 0.01960943977550397, + "loss": 3.6622, + "mean_token_accuracy": 0.3438524901866913, + "num_tokens": 1146798034.0, + "step": 2244 + }, + { + "epoch": 0.6070849107625743, + "grad_norm": 5.53125, + "learning_rate": 0.019608980270634586, + "loss": 3.6794, + "mean_token_accuracy": 0.34254294633865356, + "num_tokens": 1147322303.0, + "step": 2245 + }, + { + "epoch": 0.607355327203894, + "grad_norm": 2.296875, + "learning_rate": 0.01960852050161637, + "loss": 3.2569, + "mean_token_accuracy": 0.3913373649120331, + "num_tokens": 1147820000.0, + "step": 2246 + }, + { + "epoch": 0.6076257436452136, + "grad_norm": 3.25, + "learning_rate": 0.01960806046846343, + "loss": 3.5684, + "mean_token_accuracy": 0.3518795073032379, + "num_tokens": 1148304229.0, + "step": 2247 + }, + { + "epoch": 0.6078961600865332, + "grad_norm": 2.875, + "learning_rate": 0.019607600171189878, + "loss": 3.4826, + "mean_token_accuracy": 0.38256070017814636, + "num_tokens": 1148820904.0, + "step": 2248 + }, + { + "epoch": 0.6081665765278529, + "grad_norm": 3.3125, + "learning_rate": 0.019607139609809844, + "loss": 3.2799, + "mean_token_accuracy": 0.37723392248153687, + "num_tokens": 1149339586.0, + "step": 2249 + }, + { + "epoch": 0.6084369929691725, + "grad_norm": 3.59375, + "learning_rate": 0.01960667878433745, + "loss": 3.4788, + "mean_token_accuracy": 0.3302430212497711, + "num_tokens": 1149863865.0, + "step": 2250 + }, + { + "epoch": 0.6087074094104922, + "grad_norm": 39.5, + "learning_rate": 0.019606217694786845, + "loss": 12.0318, + "mean_token_accuracy": 0.009824084118008614, + "num_tokens": 1150388080.0, + "step": 2251 + }, + { + "epoch": 0.6089778258518118, + "grad_norm": 7.5625, + "learning_rate": 0.019605756341172172, + "loss": 4.2188, + "mean_token_accuracy": 0.30847734212875366, + "num_tokens": 1150912318.0, + "step": 2252 + }, + { + "epoch": 0.6092482422931315, + "grad_norm": 2.609375, + "learning_rate": 0.01960529472350758, + "loss": 3.5811, + "mean_token_accuracy": 0.34717804193496704, + "num_tokens": 1151436495.0, + "step": 2253 + }, + { + "epoch": 0.6095186587344511, + "grad_norm": 14.0625, + "learning_rate": 0.019604832841807248, + "loss": 3.5975, + "mean_token_accuracy": 0.3751227557659149, + "num_tokens": 1151960695.0, + "step": 2254 + }, + { + "epoch": 0.6097890751757706, + "grad_norm": 3.28125, + "learning_rate": 0.01960437069608533, + "loss": 3.6915, + "mean_token_accuracy": 0.33034783601760864, + "num_tokens": 1152484974.0, + "step": 2255 + }, + { + "epoch": 0.6100594916170903, + "grad_norm": 2.46875, + "learning_rate": 0.019603908286356022, + "loss": 3.48, + "mean_token_accuracy": 0.34959399700164795, + "num_tokens": 1153009233.0, + "step": 2256 + }, + { + "epoch": 0.6103299080584099, + "grad_norm": 4.25, + "learning_rate": 0.019603445612633504, + "loss": 3.5933, + "mean_token_accuracy": 0.3424175977706909, + "num_tokens": 1153533505.0, + "step": 2257 + }, + { + "epoch": 0.6106003244997296, + "grad_norm": 2.90625, + "learning_rate": 0.019602982674931973, + "loss": 3.4107, + "mean_token_accuracy": 0.38014572858810425, + "num_tokens": 1154022658.0, + "step": 2258 + }, + { + "epoch": 0.6108707409410492, + "grad_norm": 2.9375, + "learning_rate": 0.019602519473265634, + "loss": 3.2934, + "mean_token_accuracy": 0.3704500198364258, + "num_tokens": 1154546734.0, + "step": 2259 + }, + { + "epoch": 0.6111411573823688, + "grad_norm": 2.421875, + "learning_rate": 0.019602056007648697, + "loss": 3.5267, + "mean_token_accuracy": 0.38500267267227173, + "num_tokens": 1155070927.0, + "step": 2260 + }, + { + "epoch": 0.6114115738236885, + "grad_norm": 3.078125, + "learning_rate": 0.019601592278095388, + "loss": 3.3336, + "mean_token_accuracy": 0.3666079640388489, + "num_tokens": 1155573976.0, + "step": 2261 + }, + { + "epoch": 0.6116819902650081, + "grad_norm": 2.859375, + "learning_rate": 0.01960112828461993, + "loss": 3.5027, + "mean_token_accuracy": 0.38518184423446655, + "num_tokens": 1156098113.0, + "step": 2262 + }, + { + "epoch": 0.6119524067063278, + "grad_norm": 3.265625, + "learning_rate": 0.019600664027236564, + "loss": 3.5416, + "mean_token_accuracy": 0.356559157371521, + "num_tokens": 1156595435.0, + "step": 2263 + }, + { + "epoch": 0.6122228231476474, + "grad_norm": 2.6875, + "learning_rate": 0.01960019950595953, + "loss": 3.4281, + "mean_token_accuracy": 0.35329297184944153, + "num_tokens": 1157119505.0, + "step": 2264 + }, + { + "epoch": 0.6124932395889671, + "grad_norm": 3.4375, + "learning_rate": 0.019599734720803088, + "loss": 3.5403, + "mean_token_accuracy": 0.3410792350769043, + "num_tokens": 1157643750.0, + "step": 2265 + }, + { + "epoch": 0.6127636560302866, + "grad_norm": 2.546875, + "learning_rate": 0.01959926967178149, + "loss": 3.3886, + "mean_token_accuracy": 0.36767131090164185, + "num_tokens": 1158167956.0, + "step": 2266 + }, + { + "epoch": 0.6130340724716062, + "grad_norm": 2.796875, + "learning_rate": 0.01959880435890901, + "loss": 3.304, + "mean_token_accuracy": 0.38343894481658936, + "num_tokens": 1158681414.0, + "step": 2267 + }, + { + "epoch": 0.6133044889129259, + "grad_norm": 3.8125, + "learning_rate": 0.01959833878219993, + "loss": 3.3296, + "mean_token_accuracy": 0.3453894853591919, + "num_tokens": 1159205611.0, + "step": 2268 + }, + { + "epoch": 0.6135749053542455, + "grad_norm": 3.21875, + "learning_rate": 0.019597872941668523, + "loss": 3.3338, + "mean_token_accuracy": 0.36989399790763855, + "num_tokens": 1159729693.0, + "step": 2269 + }, + { + "epoch": 0.6138453217955652, + "grad_norm": 4.1875, + "learning_rate": 0.019597406837329096, + "loss": 3.5777, + "mean_token_accuracy": 0.368315726518631, + "num_tokens": 1160253920.0, + "step": 2270 + }, + { + "epoch": 0.6141157382368848, + "grad_norm": 179.0, + "learning_rate": 0.01959694046919594, + "loss": 13.0023, + "mean_token_accuracy": 0.0005733149591833353, + "num_tokens": 1160755144.0, + "step": 2271 + }, + { + "epoch": 0.6143861546782045, + "grad_norm": 6.28125, + "learning_rate": 0.01959647383728337, + "loss": 3.7433, + "mean_token_accuracy": 0.3135957419872284, + "num_tokens": 1161279312.0, + "step": 2272 + }, + { + "epoch": 0.6146565711195241, + "grad_norm": 2.421875, + "learning_rate": 0.0195960069416057, + "loss": 3.6344, + "mean_token_accuracy": 0.3474304676055908, + "num_tokens": 1161778623.0, + "step": 2273 + }, + { + "epoch": 0.6149269875608437, + "grad_norm": 3.015625, + "learning_rate": 0.01959553978217726, + "loss": 3.5682, + "mean_token_accuracy": 0.35206443071365356, + "num_tokens": 1162302900.0, + "step": 2274 + }, + { + "epoch": 0.6151974040021634, + "grad_norm": 2.890625, + "learning_rate": 0.019595072359012377, + "loss": 3.5051, + "mean_token_accuracy": 0.3638843894004822, + "num_tokens": 1162827153.0, + "step": 2275 + }, + { + "epoch": 0.6154678204434829, + "grad_norm": 3.5, + "learning_rate": 0.019594604672125406, + "loss": 3.4267, + "mean_token_accuracy": 0.39190614223480225, + "num_tokens": 1163286058.0, + "step": 2276 + }, + { + "epoch": 0.6157382368848026, + "grad_norm": 3.265625, + "learning_rate": 0.019594136721530684, + "loss": 3.4492, + "mean_token_accuracy": 0.3775593340396881, + "num_tokens": 1163704896.0, + "step": 2277 + }, + { + "epoch": 0.6160086533261222, + "grad_norm": 2.78125, + "learning_rate": 0.01959366850724257, + "loss": 3.3228, + "mean_token_accuracy": 0.3827451467514038, + "num_tokens": 1164229104.0, + "step": 2278 + }, + { + "epoch": 0.6162790697674418, + "grad_norm": 3.578125, + "learning_rate": 0.01959320002927544, + "loss": 3.191, + "mean_token_accuracy": 0.3781590759754181, + "num_tokens": 1164734465.0, + "step": 2279 + }, + { + "epoch": 0.6165494862087615, + "grad_norm": 3.109375, + "learning_rate": 0.019592731287643658, + "loss": 3.2905, + "mean_token_accuracy": 0.3814678192138672, + "num_tokens": 1165258647.0, + "step": 2280 + }, + { + "epoch": 0.6168199026500811, + "grad_norm": 2.78125, + "learning_rate": 0.019592262282361612, + "loss": 3.3938, + "mean_token_accuracy": 0.375718891620636, + "num_tokens": 1165732816.0, + "step": 2281 + }, + { + "epoch": 0.6170903190914008, + "grad_norm": 4.1875, + "learning_rate": 0.019591793013443694, + "loss": 3.5125, + "mean_token_accuracy": 0.3603648841381073, + "num_tokens": 1166257088.0, + "step": 2282 + }, + { + "epoch": 0.6173607355327204, + "grad_norm": 3.34375, + "learning_rate": 0.019591323480904293, + "loss": 3.7112, + "mean_token_accuracy": 0.34631314873695374, + "num_tokens": 1166781285.0, + "step": 2283 + }, + { + "epoch": 0.6176311519740401, + "grad_norm": 3.953125, + "learning_rate": 0.01959085368475783, + "loss": 3.5699, + "mean_token_accuracy": 0.40533921122550964, + "num_tokens": 1167240622.0, + "step": 2284 + }, + { + "epoch": 0.6179015684153597, + "grad_norm": 2.65625, + "learning_rate": 0.019590383625018706, + "loss": 3.4243, + "mean_token_accuracy": 0.3726138770580292, + "num_tokens": 1167764867.0, + "step": 2285 + }, + { + "epoch": 0.6181719848566792, + "grad_norm": 2.90625, + "learning_rate": 0.019589913301701352, + "loss": 3.4103, + "mean_token_accuracy": 0.387414813041687, + "num_tokens": 1168234512.0, + "step": 2286 + }, + { + "epoch": 0.6184424012979989, + "grad_norm": 2.6875, + "learning_rate": 0.019589442714820195, + "loss": 3.3518, + "mean_token_accuracy": 0.37158697843551636, + "num_tokens": 1168758701.0, + "step": 2287 + }, + { + "epoch": 0.6187128177393185, + "grad_norm": 4.15625, + "learning_rate": 0.019588971864389675, + "loss": 3.4281, + "mean_token_accuracy": 0.3706253170967102, + "num_tokens": 1169241990.0, + "step": 2288 + }, + { + "epoch": 0.6189832341806382, + "grad_norm": 2.5, + "learning_rate": 0.019588500750424242, + "loss": 3.293, + "mean_token_accuracy": 0.3856116533279419, + "num_tokens": 1169711522.0, + "step": 2289 + }, + { + "epoch": 0.6192536506219578, + "grad_norm": 2.796875, + "learning_rate": 0.01958802937293835, + "loss": 3.2202, + "mean_token_accuracy": 0.3824197053909302, + "num_tokens": 1170235741.0, + "step": 2290 + }, + { + "epoch": 0.6195240670632775, + "grad_norm": 44.5, + "learning_rate": 0.01958755773194646, + "loss": 11.8931, + "mean_token_accuracy": 0.006902322638779879, + "num_tokens": 1170759849.0, + "step": 2291 + }, + { + "epoch": 0.6197944835045971, + "grad_norm": 6.34375, + "learning_rate": 0.01958708582746304, + "loss": 3.7447, + "mean_token_accuracy": 0.3225902318954468, + "num_tokens": 1171284073.0, + "step": 2292 + }, + { + "epoch": 0.6200648999459167, + "grad_norm": 1.9375, + "learning_rate": 0.01958661365950258, + "loss": 3.4676, + "mean_token_accuracy": 0.3661736249923706, + "num_tokens": 1171781474.0, + "step": 2293 + }, + { + "epoch": 0.6203353163872364, + "grad_norm": 3.171875, + "learning_rate": 0.019586141228079557, + "loss": 3.3463, + "mean_token_accuracy": 0.3637107014656067, + "num_tokens": 1172305583.0, + "step": 2294 + }, + { + "epoch": 0.620605732828556, + "grad_norm": 2.96875, + "learning_rate": 0.019585668533208474, + "loss": 3.3639, + "mean_token_accuracy": 0.3775003254413605, + "num_tokens": 1172829853.0, + "step": 2295 + }, + { + "epoch": 0.6208761492698756, + "grad_norm": 3.25, + "learning_rate": 0.019585195574903833, + "loss": 3.2925, + "mean_token_accuracy": 0.3824797570705414, + "num_tokens": 1173334488.0, + "step": 2296 + }, + { + "epoch": 0.6211465657111952, + "grad_norm": 3.8125, + "learning_rate": 0.01958472235318014, + "loss": 3.3896, + "mean_token_accuracy": 0.35798487067222595, + "num_tokens": 1173858716.0, + "step": 2297 + }, + { + "epoch": 0.6214169821525148, + "grad_norm": 3.40625, + "learning_rate": 0.01958424886805192, + "loss": 3.3043, + "mean_token_accuracy": 0.38774558901786804, + "num_tokens": 1174382905.0, + "step": 2298 + }, + { + "epoch": 0.6216873985938345, + "grad_norm": 3.0, + "learning_rate": 0.0195837751195337, + "loss": 3.431, + "mean_token_accuracy": 0.3556728959083557, + "num_tokens": 1174876497.0, + "step": 2299 + }, + { + "epoch": 0.6219578150351541, + "grad_norm": 2.8125, + "learning_rate": 0.019583301107640016, + "loss": 3.6775, + "mean_token_accuracy": 0.3535316586494446, + "num_tokens": 1175400670.0, + "step": 2300 + }, + { + "epoch": 0.6222282314764738, + "grad_norm": 3.109375, + "learning_rate": 0.019582826832385415, + "loss": 3.5688, + "mean_token_accuracy": 0.3531768321990967, + "num_tokens": 1175924950.0, + "step": 2301 + }, + { + "epoch": 0.6224986479177934, + "grad_norm": 2.25, + "learning_rate": 0.019582352293784445, + "loss": 3.3396, + "mean_token_accuracy": 0.384897917509079, + "num_tokens": 1176414080.0, + "step": 2302 + }, + { + "epoch": 0.6227690643591131, + "grad_norm": 4.4375, + "learning_rate": 0.019581877491851665, + "loss": 3.2329, + "mean_token_accuracy": 0.4038589596748352, + "num_tokens": 1176938353.0, + "step": 2303 + }, + { + "epoch": 0.6230394808004327, + "grad_norm": 2.8125, + "learning_rate": 0.01958140242660165, + "loss": 3.1207, + "mean_token_accuracy": 0.3827863931655884, + "num_tokens": 1177462537.0, + "step": 2304 + }, + { + "epoch": 0.6233098972417523, + "grad_norm": 2.90625, + "learning_rate": 0.01958092709804897, + "loss": 3.4797, + "mean_token_accuracy": 0.3626502752304077, + "num_tokens": 1177986769.0, + "step": 2305 + }, + { + "epoch": 0.623580313683072, + "grad_norm": 3.109375, + "learning_rate": 0.019580451506208213, + "loss": 3.5197, + "mean_token_accuracy": 0.3647712469100952, + "num_tokens": 1178504384.0, + "step": 2306 + }, + { + "epoch": 0.6238507301243915, + "grad_norm": 2.484375, + "learning_rate": 0.01957997565109397, + "loss": 3.3488, + "mean_token_accuracy": 0.3691715896129608, + "num_tokens": 1178978373.0, + "step": 2307 + }, + { + "epoch": 0.6241211465657112, + "grad_norm": 3.078125, + "learning_rate": 0.01957949953272084, + "loss": 3.1548, + "mean_token_accuracy": 0.3658144176006317, + "num_tokens": 1179502548.0, + "step": 2308 + }, + { + "epoch": 0.6243915630070308, + "grad_norm": 2.109375, + "learning_rate": 0.019579023151103433, + "loss": 3.4787, + "mean_token_accuracy": 0.3680555820465088, + "num_tokens": 1180026626.0, + "step": 2309 + }, + { + "epoch": 0.6246619794483504, + "grad_norm": 2.890625, + "learning_rate": 0.019578546506256372, + "loss": 3.3951, + "mean_token_accuracy": 0.3968217670917511, + "num_tokens": 1180518225.0, + "step": 2310 + }, + { + "epoch": 0.6249323958896701, + "grad_norm": 95.0, + "learning_rate": 0.019578069598194272, + "loss": 11.2708, + "mean_token_accuracy": 0.006128986366093159, + "num_tokens": 1181042487.0, + "step": 2311 + }, + { + "epoch": 0.6252028123309897, + "grad_norm": 7.90625, + "learning_rate": 0.01957759242693177, + "loss": 4.1981, + "mean_token_accuracy": 0.2887968420982361, + "num_tokens": 1181566685.0, + "step": 2312 + }, + { + "epoch": 0.6254732287723094, + "grad_norm": 2.734375, + "learning_rate": 0.01957711499248351, + "loss": 3.8013, + "mean_token_accuracy": 0.3383195400238037, + "num_tokens": 1182090961.0, + "step": 2313 + }, + { + "epoch": 0.625743645213629, + "grad_norm": 3.90625, + "learning_rate": 0.019576637294864137, + "loss": 3.3924, + "mean_token_accuracy": 0.36309969425201416, + "num_tokens": 1182615121.0, + "step": 2314 + }, + { + "epoch": 0.6260140616549487, + "grad_norm": 2.46875, + "learning_rate": 0.01957615933408831, + "loss": 3.5173, + "mean_token_accuracy": 0.3633255362510681, + "num_tokens": 1183139310.0, + "step": 2315 + }, + { + "epoch": 0.6262844780962683, + "grad_norm": 2.890625, + "learning_rate": 0.01957568111017069, + "loss": 3.3503, + "mean_token_accuracy": 0.36806821823120117, + "num_tokens": 1183663545.0, + "step": 2316 + }, + { + "epoch": 0.6265548945375878, + "grad_norm": 3.359375, + "learning_rate": 0.01957520262312596, + "loss": 3.3099, + "mean_token_accuracy": 0.3945784568786621, + "num_tokens": 1184172832.0, + "step": 2317 + }, + { + "epoch": 0.6268253109789075, + "grad_norm": 4.75, + "learning_rate": 0.019574723872968793, + "loss": 3.4676, + "mean_token_accuracy": 0.34884709119796753, + "num_tokens": 1184697094.0, + "step": 2318 + }, + { + "epoch": 0.6270957274202271, + "grad_norm": 2.875, + "learning_rate": 0.01957424485971388, + "loss": 3.3704, + "mean_token_accuracy": 0.3530997633934021, + "num_tokens": 1185221247.0, + "step": 2319 + }, + { + "epoch": 0.6273661438615468, + "grad_norm": 3.171875, + "learning_rate": 0.01957376558337592, + "loss": 3.5732, + "mean_token_accuracy": 0.3743293881416321, + "num_tokens": 1185683886.0, + "step": 2320 + }, + { + "epoch": 0.6276365603028664, + "grad_norm": 3.40625, + "learning_rate": 0.019573286043969622, + "loss": 3.5244, + "mean_token_accuracy": 0.3660247325897217, + "num_tokens": 1186208005.0, + "step": 2321 + }, + { + "epoch": 0.627906976744186, + "grad_norm": 2.9375, + "learning_rate": 0.01957280624150969, + "loss": 3.214, + "mean_token_accuracy": 0.35864323377609253, + "num_tokens": 1186732198.0, + "step": 2322 + }, + { + "epoch": 0.6281773931855057, + "grad_norm": 3.015625, + "learning_rate": 0.01957232617601085, + "loss": 3.2492, + "mean_token_accuracy": 0.38103577494621277, + "num_tokens": 1187176726.0, + "step": 2323 + }, + { + "epoch": 0.6284478096268253, + "grad_norm": 2.890625, + "learning_rate": 0.019571845847487838, + "loss": 3.4926, + "mean_token_accuracy": 0.3630668520927429, + "num_tokens": 1187683401.0, + "step": 2324 + }, + { + "epoch": 0.628718226068145, + "grad_norm": 2.65625, + "learning_rate": 0.019571365255955386, + "loss": 3.3921, + "mean_token_accuracy": 0.367941677570343, + "num_tokens": 1188186123.0, + "step": 2325 + }, + { + "epoch": 0.6289886425094646, + "grad_norm": 2.484375, + "learning_rate": 0.01957088440142824, + "loss": 3.6007, + "mean_token_accuracy": 0.3621096611022949, + "num_tokens": 1188710394.0, + "step": 2326 + }, + { + "epoch": 0.6292590589507842, + "grad_norm": 3.171875, + "learning_rate": 0.019570403283921158, + "loss": 3.2742, + "mean_token_accuracy": 0.38282978534698486, + "num_tokens": 1189169890.0, + "step": 2327 + }, + { + "epoch": 0.6295294753921038, + "grad_norm": 2.5625, + "learning_rate": 0.0195699219034489, + "loss": 3.2505, + "mean_token_accuracy": 0.3777405917644501, + "num_tokens": 1189694155.0, + "step": 2328 + }, + { + "epoch": 0.6297998918334234, + "grad_norm": 3.296875, + "learning_rate": 0.019569440260026234, + "loss": 3.3022, + "mean_token_accuracy": 0.3900133967399597, + "num_tokens": 1190189500.0, + "step": 2329 + }, + { + "epoch": 0.6300703082747431, + "grad_norm": 3.625, + "learning_rate": 0.01956895835366794, + "loss": 3.678, + "mean_token_accuracy": 0.31296515464782715, + "num_tokens": 1190663040.0, + "step": 2330 + }, + { + "epoch": 0.6303407247160627, + "grad_norm": 42.5, + "learning_rate": 0.0195684761843888, + "loss": 10.7794, + "mean_token_accuracy": 0.00779195548966527, + "num_tokens": 1191143677.0, + "step": 2331 + }, + { + "epoch": 0.6306111411573824, + "grad_norm": 6.875, + "learning_rate": 0.019567993752203618, + "loss": 4.0817, + "mean_token_accuracy": 0.34255659580230713, + "num_tokens": 1191667962.0, + "step": 2332 + }, + { + "epoch": 0.630881557598702, + "grad_norm": 4.21875, + "learning_rate": 0.019567511057127188, + "loss": 3.6159, + "mean_token_accuracy": 0.3562467098236084, + "num_tokens": 1192192193.0, + "step": 2333 + }, + { + "epoch": 0.6311519740400217, + "grad_norm": 4.125, + "learning_rate": 0.01956702809917432, + "loss": 3.6997, + "mean_token_accuracy": 0.3511807322502136, + "num_tokens": 1192716411.0, + "step": 2334 + }, + { + "epoch": 0.6314223904813413, + "grad_norm": 2.984375, + "learning_rate": 0.01956654487835984, + "loss": 3.5684, + "mean_token_accuracy": 0.3491121828556061, + "num_tokens": 1193240686.0, + "step": 2335 + }, + { + "epoch": 0.6316928069226609, + "grad_norm": 2.859375, + "learning_rate": 0.01956606139469857, + "loss": 3.4189, + "mean_token_accuracy": 0.3666883111000061, + "num_tokens": 1193764799.0, + "step": 2336 + }, + { + "epoch": 0.6319632233639805, + "grad_norm": 2.390625, + "learning_rate": 0.019565577648205348, + "loss": 3.4115, + "mean_token_accuracy": 0.3639526963233948, + "num_tokens": 1194265132.0, + "step": 2337 + }, + { + "epoch": 0.6322336398053001, + "grad_norm": 2.890625, + "learning_rate": 0.019565093638895012, + "loss": 3.4701, + "mean_token_accuracy": 0.3625454902648926, + "num_tokens": 1194789210.0, + "step": 2338 + }, + { + "epoch": 0.6325040562466198, + "grad_norm": 2.84375, + "learning_rate": 0.019564609366782417, + "loss": 3.3445, + "mean_token_accuracy": 0.37532156705856323, + "num_tokens": 1195313415.0, + "step": 2339 + }, + { + "epoch": 0.6327744726879394, + "grad_norm": 2.546875, + "learning_rate": 0.019564124831882416, + "loss": 3.2645, + "mean_token_accuracy": 0.38012412190437317, + "num_tokens": 1195837561.0, + "step": 2340 + }, + { + "epoch": 0.633044889129259, + "grad_norm": 3.40625, + "learning_rate": 0.019563640034209878, + "loss": 3.303, + "mean_token_accuracy": 0.37003642320632935, + "num_tokens": 1196361829.0, + "step": 2341 + }, + { + "epoch": 0.6333153055705787, + "grad_norm": 2.671875, + "learning_rate": 0.019563154973779685, + "loss": 3.0994, + "mean_token_accuracy": 0.40274301171302795, + "num_tokens": 1196819009.0, + "step": 2342 + }, + { + "epoch": 0.6335857220118983, + "grad_norm": 2.75, + "learning_rate": 0.019562669650606712, + "loss": 3.3761, + "mean_token_accuracy": 0.3585454523563385, + "num_tokens": 1197343249.0, + "step": 2343 + }, + { + "epoch": 0.633856138453218, + "grad_norm": 2.984375, + "learning_rate": 0.019562184064705855, + "loss": 3.6527, + "mean_token_accuracy": 0.34354349970817566, + "num_tokens": 1197838806.0, + "step": 2344 + }, + { + "epoch": 0.6341265548945376, + "grad_norm": 3.234375, + "learning_rate": 0.01956169821609201, + "loss": 3.4189, + "mean_token_accuracy": 0.35199621319770813, + "num_tokens": 1198362975.0, + "step": 2345 + }, + { + "epoch": 0.6343969713358573, + "grad_norm": 3.71875, + "learning_rate": 0.019561212104780085, + "loss": 3.5624, + "mean_token_accuracy": 0.3755534887313843, + "num_tokens": 1198887259.0, + "step": 2346 + }, + { + "epoch": 0.6346673877771769, + "grad_norm": 3.328125, + "learning_rate": 0.019560725730784993, + "loss": 3.172, + "mean_token_accuracy": 0.4017171263694763, + "num_tokens": 1199285445.0, + "step": 2347 + }, + { + "epoch": 0.6349378042184964, + "grad_norm": 2.84375, + "learning_rate": 0.019560239094121665, + "loss": 3.2209, + "mean_token_accuracy": 0.37792980670928955, + "num_tokens": 1199809713.0, + "step": 2348 + }, + { + "epoch": 0.6352082206598161, + "grad_norm": 3.21875, + "learning_rate": 0.019559752194805023, + "loss": 3.4225, + "mean_token_accuracy": 0.3391588032245636, + "num_tokens": 1200333827.0, + "step": 2349 + }, + { + "epoch": 0.6354786371011357, + "grad_norm": 2.5625, + "learning_rate": 0.019559265032850017, + "loss": 3.3556, + "mean_token_accuracy": 0.3631364703178406, + "num_tokens": 1200857999.0, + "step": 2350 + }, + { + "epoch": 0.6357490535424554, + "grad_norm": 410.0, + "learning_rate": 0.019558777608271585, + "loss": 13.362, + "mean_token_accuracy": 2.6257193894707598e-06, + "num_tokens": 1201382208.0, + "step": 2351 + }, + { + "epoch": 0.636019469983775, + "grad_norm": 8.75, + "learning_rate": 0.019558289921084687, + "loss": 4.1775, + "mean_token_accuracy": 0.28649163246154785, + "num_tokens": 1201906430.0, + "step": 2352 + }, + { + "epoch": 0.6362898864250947, + "grad_norm": 2.53125, + "learning_rate": 0.019557801971304287, + "loss": 3.5166, + "mean_token_accuracy": 0.35174959897994995, + "num_tokens": 1202430581.0, + "step": 2353 + }, + { + "epoch": 0.6365603028664143, + "grad_norm": 2.828125, + "learning_rate": 0.01955731375894535, + "loss": 3.5558, + "mean_token_accuracy": 0.35839205980300903, + "num_tokens": 1202923333.0, + "step": 2354 + }, + { + "epoch": 0.6368307193077339, + "grad_norm": 3.0, + "learning_rate": 0.019556825284022864, + "loss": 3.3965, + "mean_token_accuracy": 0.368718683719635, + "num_tokens": 1203399907.0, + "step": 2355 + }, + { + "epoch": 0.6371011357490536, + "grad_norm": 3.46875, + "learning_rate": 0.019556336546551818, + "loss": 3.5718, + "mean_token_accuracy": 0.3649557828903198, + "num_tokens": 1203924012.0, + "step": 2356 + }, + { + "epoch": 0.6373715521903732, + "grad_norm": 3.28125, + "learning_rate": 0.019555847546547205, + "loss": 3.3658, + "mean_token_accuracy": 0.36996957659721375, + "num_tokens": 1204410517.0, + "step": 2357 + }, + { + "epoch": 0.6376419686316928, + "grad_norm": 2.859375, + "learning_rate": 0.01955535828402402, + "loss": 3.334, + "mean_token_accuracy": 0.36186468601226807, + "num_tokens": 1204933983.0, + "step": 2358 + }, + { + "epoch": 0.6379123850730124, + "grad_norm": 2.84375, + "learning_rate": 0.019554868758997288, + "loss": 3.5039, + "mean_token_accuracy": 0.35891997814178467, + "num_tokens": 1205458202.0, + "step": 2359 + }, + { + "epoch": 0.638182801514332, + "grad_norm": 3.296875, + "learning_rate": 0.019554378971482027, + "loss": 3.3821, + "mean_token_accuracy": 0.36877503991127014, + "num_tokens": 1205972446.0, + "step": 2360 + }, + { + "epoch": 0.6384532179556517, + "grad_norm": 3.953125, + "learning_rate": 0.01955388892149326, + "loss": 3.6405, + "mean_token_accuracy": 0.3741099238395691, + "num_tokens": 1206496602.0, + "step": 2361 + }, + { + "epoch": 0.6387236343969713, + "grad_norm": 3.0, + "learning_rate": 0.019553398609046025, + "loss": 3.341, + "mean_token_accuracy": 0.3630051016807556, + "num_tokens": 1207020692.0, + "step": 2362 + }, + { + "epoch": 0.638994050838291, + "grad_norm": 2.96875, + "learning_rate": 0.019552908034155367, + "loss": 3.0908, + "mean_token_accuracy": 0.39812299609184265, + "num_tokens": 1207544948.0, + "step": 2363 + }, + { + "epoch": 0.6392644672796106, + "grad_norm": 3.625, + "learning_rate": 0.019552417196836338, + "loss": 3.4773, + "mean_token_accuracy": 0.357951819896698, + "num_tokens": 1208069166.0, + "step": 2364 + }, + { + "epoch": 0.6395348837209303, + "grad_norm": 3.328125, + "learning_rate": 0.019551926097104, + "loss": 3.2625, + "mean_token_accuracy": 0.3598661720752716, + "num_tokens": 1208593348.0, + "step": 2365 + }, + { + "epoch": 0.6398053001622499, + "grad_norm": 3.390625, + "learning_rate": 0.01955143473497342, + "loss": 3.4632, + "mean_token_accuracy": 0.3924332857131958, + "num_tokens": 1209117367.0, + "step": 2366 + }, + { + "epoch": 0.6400757166035695, + "grad_norm": 3.0625, + "learning_rate": 0.01955094311045967, + "loss": 3.4479, + "mean_token_accuracy": 0.38660770654678345, + "num_tokens": 1209527993.0, + "step": 2367 + }, + { + "epoch": 0.6403461330448891, + "grad_norm": 2.75, + "learning_rate": 0.01955045122357784, + "loss": 3.2502, + "mean_token_accuracy": 0.37687650322914124, + "num_tokens": 1210017958.0, + "step": 2368 + }, + { + "epoch": 0.6406165494862087, + "grad_norm": 2.921875, + "learning_rate": 0.019549959074343023, + "loss": 3.2575, + "mean_token_accuracy": 0.3798710107803345, + "num_tokens": 1210542123.0, + "step": 2369 + }, + { + "epoch": 0.6408869659275284, + "grad_norm": 2.59375, + "learning_rate": 0.01954946666277032, + "loss": 3.4189, + "mean_token_accuracy": 0.38128602504730225, + "num_tokens": 1211066327.0, + "step": 2370 + }, + { + "epoch": 0.641157382368848, + "grad_norm": 5.78125, + "learning_rate": 0.019548973988874833, + "loss": 11.0276, + "mean_token_accuracy": 5.472583325172309e-06, + "num_tokens": 1211590581.0, + "step": 2371 + }, + { + "epoch": 0.6414277988101676, + "grad_norm": 5.9375, + "learning_rate": 0.019548481052671686, + "loss": 3.7045, + "mean_token_accuracy": 0.28712326288223267, + "num_tokens": 1212114668.0, + "step": 2372 + }, + { + "epoch": 0.6416982152514873, + "grad_norm": 3.0, + "learning_rate": 0.019547987854176, + "loss": 3.8798, + "mean_token_accuracy": 0.33273592591285706, + "num_tokens": 1212638758.0, + "step": 2373 + }, + { + "epoch": 0.6419686316928069, + "grad_norm": 2.734375, + "learning_rate": 0.019547494393402906, + "loss": 3.5422, + "mean_token_accuracy": 0.3583298325538635, + "num_tokens": 1213163040.0, + "step": 2374 + }, + { + "epoch": 0.6422390481341266, + "grad_norm": 3.171875, + "learning_rate": 0.01954700067036755, + "loss": 3.4214, + "mean_token_accuracy": 0.35723209381103516, + "num_tokens": 1213687251.0, + "step": 2375 + }, + { + "epoch": 0.6425094645754462, + "grad_norm": 2.734375, + "learning_rate": 0.01954650668508508, + "loss": 3.3677, + "mean_token_accuracy": 0.35766372084617615, + "num_tokens": 1214211538.0, + "step": 2376 + }, + { + "epoch": 0.6427798810167659, + "grad_norm": 3.796875, + "learning_rate": 0.019546012437570648, + "loss": 3.7739, + "mean_token_accuracy": 0.33402499556541443, + "num_tokens": 1214702973.0, + "step": 2377 + }, + { + "epoch": 0.6430502974580855, + "grad_norm": 2.6875, + "learning_rate": 0.019545517927839425, + "loss": 3.35, + "mean_token_accuracy": 0.37630945444107056, + "num_tokens": 1215227092.0, + "step": 2378 + }, + { + "epoch": 0.643320713899405, + "grad_norm": 2.984375, + "learning_rate": 0.01954502315590658, + "loss": 3.3476, + "mean_token_accuracy": 0.3572899103164673, + "num_tokens": 1215751378.0, + "step": 2379 + }, + { + "epoch": 0.6435911303407247, + "grad_norm": 2.34375, + "learning_rate": 0.019544528121787298, + "loss": 3.4214, + "mean_token_accuracy": 0.37918156385421753, + "num_tokens": 1216238362.0, + "step": 2380 + }, + { + "epoch": 0.6438615467820443, + "grad_norm": 2.609375, + "learning_rate": 0.01954403282549676, + "loss": 3.3333, + "mean_token_accuracy": 0.37679457664489746, + "num_tokens": 1216712013.0, + "step": 2381 + }, + { + "epoch": 0.644131963223364, + "grad_norm": 2.671875, + "learning_rate": 0.01954353726705017, + "loss": 3.3437, + "mean_token_accuracy": 0.373176634311676, + "num_tokens": 1217236244.0, + "step": 2382 + }, + { + "epoch": 0.6444023796646836, + "grad_norm": 2.84375, + "learning_rate": 0.019543041446462735, + "loss": 3.5871, + "mean_token_accuracy": 0.3408099412918091, + "num_tokens": 1217760452.0, + "step": 2383 + }, + { + "epoch": 0.6446727961060033, + "grad_norm": 2.609375, + "learning_rate": 0.01954254536374966, + "loss": 3.3412, + "mean_token_accuracy": 0.3749254047870636, + "num_tokens": 1218284656.0, + "step": 2384 + }, + { + "epoch": 0.6449432125473229, + "grad_norm": 3.125, + "learning_rate": 0.019542049018926175, + "loss": 3.4824, + "mean_token_accuracy": 0.3417366147041321, + "num_tokens": 1218808920.0, + "step": 2385 + }, + { + "epoch": 0.6452136289886425, + "grad_norm": 3.15625, + "learning_rate": 0.019541552412007505, + "loss": 3.5583, + "mean_token_accuracy": 0.37402838468551636, + "num_tokens": 1219333189.0, + "step": 2386 + }, + { + "epoch": 0.6454840454299622, + "grad_norm": 2.984375, + "learning_rate": 0.01954105554300889, + "loss": 3.2471, + "mean_token_accuracy": 0.37737584114074707, + "num_tokens": 1219857422.0, + "step": 2387 + }, + { + "epoch": 0.6457544618712818, + "grad_norm": 2.578125, + "learning_rate": 0.01954055841194557, + "loss": 3.2265, + "mean_token_accuracy": 0.3935703635215759, + "num_tokens": 1220322949.0, + "step": 2388 + }, + { + "epoch": 0.6460248783126014, + "grad_norm": 4.4375, + "learning_rate": 0.0195400610188328, + "loss": 3.4967, + "mean_token_accuracy": 0.34418779611587524, + "num_tokens": 1220847205.0, + "step": 2389 + }, + { + "epoch": 0.646295294753921, + "grad_norm": 3.796875, + "learning_rate": 0.019539563363685848, + "loss": 3.53, + "mean_token_accuracy": 0.3969455659389496, + "num_tokens": 1221311597.0, + "step": 2390 + }, + { + "epoch": 0.6465657111952406, + "grad_norm": 57.25, + "learning_rate": 0.01953906544651998, + "loss": 10.0246, + "mean_token_accuracy": 0.02010277658700943, + "num_tokens": 1221835684.0, + "step": 2391 + }, + { + "epoch": 0.6468361276365603, + "grad_norm": 7.96875, + "learning_rate": 0.019538567267350468, + "loss": 4.3432, + "mean_token_accuracy": 0.25191211700439453, + "num_tokens": 1222359854.0, + "step": 2392 + }, + { + "epoch": 0.6471065440778799, + "grad_norm": 3.046875, + "learning_rate": 0.019538068826192603, + "loss": 3.4715, + "mean_token_accuracy": 0.34017542004585266, + "num_tokens": 1222884085.0, + "step": 2393 + }, + { + "epoch": 0.6473769605191996, + "grad_norm": 2.46875, + "learning_rate": 0.019537570123061676, + "loss": 3.6125, + "mean_token_accuracy": 0.3753238320350647, + "num_tokens": 1223333149.0, + "step": 2394 + }, + { + "epoch": 0.6476473769605192, + "grad_norm": 3.078125, + "learning_rate": 0.019537071157972992, + "loss": 3.1803, + "mean_token_accuracy": 0.40143612027168274, + "num_tokens": 1223800107.0, + "step": 2395 + }, + { + "epoch": 0.6479177934018389, + "grad_norm": 2.96875, + "learning_rate": 0.01953657193094186, + "loss": 3.1974, + "mean_token_accuracy": 0.4085237383842468, + "num_tokens": 1224324215.0, + "step": 2396 + }, + { + "epoch": 0.6481882098431585, + "grad_norm": 2.703125, + "learning_rate": 0.019536072441983597, + "loss": 3.2967, + "mean_token_accuracy": 0.39611655473709106, + "num_tokens": 1224848382.0, + "step": 2397 + }, + { + "epoch": 0.6484586262844781, + "grad_norm": 2.6875, + "learning_rate": 0.01953557269111353, + "loss": 3.3164, + "mean_token_accuracy": 0.37882035970687866, + "num_tokens": 1225372536.0, + "step": 2398 + }, + { + "epoch": 0.6487290427257977, + "grad_norm": 3.265625, + "learning_rate": 0.019535072678346983, + "loss": 3.4581, + "mean_token_accuracy": 0.3637794554233551, + "num_tokens": 1225890515.0, + "step": 2399 + }, + { + "epoch": 0.6489994591671173, + "grad_norm": 3.5625, + "learning_rate": 0.019534572403699314, + "loss": 3.3414, + "mean_token_accuracy": 0.37805309891700745, + "num_tokens": 1226387081.0, + "step": 2400 + }, + { + "epoch": 0.649269875608437, + "grad_norm": 3.265625, + "learning_rate": 0.01953407186718586, + "loss": 3.4731, + "mean_token_accuracy": 0.37862762808799744, + "num_tokens": 1226911261.0, + "step": 2401 + }, + { + "epoch": 0.6495402920497566, + "grad_norm": 2.328125, + "learning_rate": 0.01953357106882199, + "loss": 3.3159, + "mean_token_accuracy": 0.371537983417511, + "num_tokens": 1227435405.0, + "step": 2402 + }, + { + "epoch": 0.6498107084910762, + "grad_norm": 2.71875, + "learning_rate": 0.01953307000862306, + "loss": 3.3669, + "mean_token_accuracy": 0.36628690361976624, + "num_tokens": 1227959591.0, + "step": 2403 + }, + { + "epoch": 0.6500811249323959, + "grad_norm": 2.671875, + "learning_rate": 0.019532568686604447, + "loss": 3.5038, + "mean_token_accuracy": 0.37758514285087585, + "num_tokens": 1228460121.0, + "step": 2404 + }, + { + "epoch": 0.6503515413737155, + "grad_norm": 3.15625, + "learning_rate": 0.01953206710278154, + "loss": 3.3752, + "mean_token_accuracy": 0.36255741119384766, + "num_tokens": 1228984278.0, + "step": 2405 + }, + { + "epoch": 0.6506219578150352, + "grad_norm": 2.734375, + "learning_rate": 0.019531565257169717, + "loss": 3.3489, + "mean_token_accuracy": 0.40167510509490967, + "num_tokens": 1229508474.0, + "step": 2406 + }, + { + "epoch": 0.6508923742563548, + "grad_norm": 3.890625, + "learning_rate": 0.019531063149784384, + "loss": 3.157, + "mean_token_accuracy": 0.3780336081981659, + "num_tokens": 1230021649.0, + "step": 2407 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 3.25, + "learning_rate": 0.019530560780640947, + "loss": 3.348, + "mean_token_accuracy": 0.374289870262146, + "num_tokens": 1230545783.0, + "step": 2408 + }, + { + "epoch": 0.651433207138994, + "grad_norm": 3.0625, + "learning_rate": 0.01953005814975482, + "loss": 3.6079, + "mean_token_accuracy": 0.35069331526756287, + "num_tokens": 1231069887.0, + "step": 2409 + }, + { + "epoch": 0.6517036235803136, + "grad_norm": 3.6875, + "learning_rate": 0.01952955525714142, + "loss": 3.5234, + "mean_token_accuracy": 0.3520622253417969, + "num_tokens": 1231594165.0, + "step": 2410 + }, + { + "epoch": 0.6519740400216333, + "grad_norm": 205.0, + "learning_rate": 0.01952905210281618, + "loss": 14.4366, + "mean_token_accuracy": 0.010308126918971539, + "num_tokens": 1232118239.0, + "step": 2411 + }, + { + "epoch": 0.6522444564629529, + "grad_norm": 12.75, + "learning_rate": 0.01952854868679454, + "loss": 4.4886, + "mean_token_accuracy": 0.309997022151947, + "num_tokens": 1232594396.0, + "step": 2412 + }, + { + "epoch": 0.6525148729042726, + "grad_norm": 3.5, + "learning_rate": 0.019528045009091947, + "loss": 3.6881, + "mean_token_accuracy": 0.3395300805568695, + "num_tokens": 1233118524.0, + "step": 2413 + }, + { + "epoch": 0.6527852893455922, + "grad_norm": 3.234375, + "learning_rate": 0.019527541069723854, + "loss": 3.3334, + "mean_token_accuracy": 0.40185853838920593, + "num_tokens": 1233642800.0, + "step": 2414 + }, + { + "epoch": 0.6530557057869119, + "grad_norm": 2.71875, + "learning_rate": 0.01952703686870572, + "loss": 3.5728, + "mean_token_accuracy": 0.34638822078704834, + "num_tokens": 1234166988.0, + "step": 2415 + }, + { + "epoch": 0.6533261222282315, + "grad_norm": 3.609375, + "learning_rate": 0.01952653240605302, + "loss": 3.381, + "mean_token_accuracy": 0.3588468134403229, + "num_tokens": 1234691155.0, + "step": 2416 + }, + { + "epoch": 0.6535965386695511, + "grad_norm": 2.5625, + "learning_rate": 0.019526027681781234, + "loss": 3.4313, + "mean_token_accuracy": 0.3751620054244995, + "num_tokens": 1235215429.0, + "step": 2417 + }, + { + "epoch": 0.6538669551108708, + "grad_norm": 3.265625, + "learning_rate": 0.019525522695905843, + "loss": 3.4468, + "mean_token_accuracy": 0.3570477366447449, + "num_tokens": 1235739562.0, + "step": 2418 + }, + { + "epoch": 0.6541373715521904, + "grad_norm": 3.09375, + "learning_rate": 0.019525017448442344, + "loss": 3.4112, + "mean_token_accuracy": 0.3684285581111908, + "num_tokens": 1236263752.0, + "step": 2419 + }, + { + "epoch": 0.65440778799351, + "grad_norm": 2.71875, + "learning_rate": 0.019524511939406238, + "loss": 3.5144, + "mean_token_accuracy": 0.34790727496147156, + "num_tokens": 1236788030.0, + "step": 2420 + }, + { + "epoch": 0.6546782044348296, + "grad_norm": 2.78125, + "learning_rate": 0.01952400616881304, + "loss": 3.1905, + "mean_token_accuracy": 0.37594932317733765, + "num_tokens": 1237312164.0, + "step": 2421 + }, + { + "epoch": 0.6549486208761492, + "grad_norm": 3.25, + "learning_rate": 0.01952350013667826, + "loss": 3.2757, + "mean_token_accuracy": 0.3791821002960205, + "num_tokens": 1237836409.0, + "step": 2422 + }, + { + "epoch": 0.6552190373174689, + "grad_norm": 2.84375, + "learning_rate": 0.019522993843017436, + "loss": 3.559, + "mean_token_accuracy": 0.35558080673217773, + "num_tokens": 1238348566.0, + "step": 2423 + }, + { + "epoch": 0.6554894537587885, + "grad_norm": 5.5, + "learning_rate": 0.019522487287846092, + "loss": 3.0897, + "mean_token_accuracy": 0.38043564558029175, + "num_tokens": 1238866143.0, + "step": 2424 + }, + { + "epoch": 0.6557598702001082, + "grad_norm": 2.03125, + "learning_rate": 0.019521980471179777, + "loss": 3.4396, + "mean_token_accuracy": 0.40035420656204224, + "num_tokens": 1239322027.0, + "step": 2425 + }, + { + "epoch": 0.6560302866414278, + "grad_norm": 4.6875, + "learning_rate": 0.01952147339303404, + "loss": 3.5859, + "mean_token_accuracy": 0.3435448706150055, + "num_tokens": 1239819912.0, + "step": 2426 + }, + { + "epoch": 0.6563007030827475, + "grad_norm": 2.40625, + "learning_rate": 0.01952096605342444, + "loss": 3.3734, + "mean_token_accuracy": 0.35313618183135986, + "num_tokens": 1240344033.0, + "step": 2427 + }, + { + "epoch": 0.6565711195240671, + "grad_norm": 4.25, + "learning_rate": 0.019520458452366538, + "loss": 3.7445, + "mean_token_accuracy": 0.3254871666431427, + "num_tokens": 1240868165.0, + "step": 2428 + }, + { + "epoch": 0.6568415359653867, + "grad_norm": 2.578125, + "learning_rate": 0.019519950589875917, + "loss": 3.1919, + "mean_token_accuracy": 0.3866145610809326, + "num_tokens": 1241392448.0, + "step": 2429 + }, + { + "epoch": 0.6571119524067063, + "grad_norm": 2.890625, + "learning_rate": 0.019519442465968154, + "loss": 3.5668, + "mean_token_accuracy": 0.35347163677215576, + "num_tokens": 1241916695.0, + "step": 2430 + }, + { + "epoch": 0.6573823688480259, + "grad_norm": 120.0, + "learning_rate": 0.019518934080658846, + "loss": 22.1418, + "mean_token_accuracy": 0.0, + "num_tokens": 1242440948.0, + "step": 2431 + }, + { + "epoch": 0.6576527852893456, + "grad_norm": 9.1875, + "learning_rate": 0.019518425433963586, + "loss": 4.2959, + "mean_token_accuracy": 0.28635352849960327, + "num_tokens": 1242961017.0, + "step": 2432 + }, + { + "epoch": 0.6579232017306652, + "grad_norm": 3.140625, + "learning_rate": 0.01951791652589798, + "loss": 3.5232, + "mean_token_accuracy": 0.3935021758079529, + "num_tokens": 1243485183.0, + "step": 2433 + }, + { + "epoch": 0.6581936181719849, + "grad_norm": 3.1875, + "learning_rate": 0.019517407356477653, + "loss": 3.638, + "mean_token_accuracy": 0.3660963177680969, + "num_tokens": 1244009402.0, + "step": 2434 + }, + { + "epoch": 0.6584640346133045, + "grad_norm": 3.328125, + "learning_rate": 0.019516897925718216, + "loss": 3.577, + "mean_token_accuracy": 0.34592342376708984, + "num_tokens": 1244533588.0, + "step": 2435 + }, + { + "epoch": 0.6587344510546241, + "grad_norm": 2.28125, + "learning_rate": 0.0195163882336353, + "loss": 3.3845, + "mean_token_accuracy": 0.3841525912284851, + "num_tokens": 1245057803.0, + "step": 2436 + }, + { + "epoch": 0.6590048674959438, + "grad_norm": 3.203125, + "learning_rate": 0.019515878280244554, + "loss": 3.5428, + "mean_token_accuracy": 0.3557608723640442, + "num_tokens": 1245582012.0, + "step": 2437 + }, + { + "epoch": 0.6592752839372634, + "grad_norm": 2.6875, + "learning_rate": 0.019515368065561613, + "loss": 3.4023, + "mean_token_accuracy": 0.3731633424758911, + "num_tokens": 1246106173.0, + "step": 2438 + }, + { + "epoch": 0.6595457003785831, + "grad_norm": 2.953125, + "learning_rate": 0.01951485758960214, + "loss": 3.357, + "mean_token_accuracy": 0.37179476022720337, + "num_tokens": 1246630354.0, + "step": 2439 + }, + { + "epoch": 0.6598161168199026, + "grad_norm": 8.5625, + "learning_rate": 0.0195143468523818, + "loss": 3.3817, + "mean_token_accuracy": 0.37644344568252563, + "num_tokens": 1247154437.0, + "step": 2440 + }, + { + "epoch": 0.6600865332612222, + "grad_norm": 2.09375, + "learning_rate": 0.019513835853916254, + "loss": 3.5537, + "mean_token_accuracy": 0.36853229999542236, + "num_tokens": 1247678674.0, + "step": 2441 + }, + { + "epoch": 0.6603569497025419, + "grad_norm": 2.78125, + "learning_rate": 0.019513324594221188, + "loss": 3.4025, + "mean_token_accuracy": 0.4131871163845062, + "num_tokens": 1248202790.0, + "step": 2442 + }, + { + "epoch": 0.6606273661438615, + "grad_norm": 2.359375, + "learning_rate": 0.019512813073312287, + "loss": 3.2186, + "mean_token_accuracy": 0.3877360224723816, + "num_tokens": 1248714704.0, + "step": 2443 + }, + { + "epoch": 0.6608977825851812, + "grad_norm": 3.546875, + "learning_rate": 0.01951230129120525, + "loss": 3.4235, + "mean_token_accuracy": 0.3754523992538452, + "num_tokens": 1249238917.0, + "step": 2444 + }, + { + "epoch": 0.6611681990265008, + "grad_norm": 2.96875, + "learning_rate": 0.019511789247915773, + "loss": 3.5117, + "mean_token_accuracy": 0.3560165464878082, + "num_tokens": 1249760817.0, + "step": 2445 + }, + { + "epoch": 0.6614386154678205, + "grad_norm": 2.453125, + "learning_rate": 0.019511276943459572, + "loss": 3.3244, + "mean_token_accuracy": 0.36924678087234497, + "num_tokens": 1250285083.0, + "step": 2446 + }, + { + "epoch": 0.6617090319091401, + "grad_norm": 3.5, + "learning_rate": 0.01951076437785236, + "loss": 3.538, + "mean_token_accuracy": 0.33726248145103455, + "num_tokens": 1250809365.0, + "step": 2447 + }, + { + "epoch": 0.6619794483504597, + "grad_norm": 3.46875, + "learning_rate": 0.019510251551109876, + "loss": 3.5366, + "mean_token_accuracy": 0.3610367178916931, + "num_tokens": 1251331362.0, + "step": 2448 + }, + { + "epoch": 0.6622498647917794, + "grad_norm": 3.203125, + "learning_rate": 0.01950973846324784, + "loss": 3.47, + "mean_token_accuracy": 0.38953322172164917, + "num_tokens": 1251797016.0, + "step": 2449 + }, + { + "epoch": 0.6625202812330989, + "grad_norm": 3.03125, + "learning_rate": 0.019509225114282006, + "loss": 3.4795, + "mean_token_accuracy": 0.37462663650512695, + "num_tokens": 1252299382.0, + "step": 2450 + }, + { + "epoch": 0.6627906976744186, + "grad_norm": 65.0, + "learning_rate": 0.01950871150422812, + "loss": 26.6199, + "mean_token_accuracy": 0.019609544426202774, + "num_tokens": 1252823643.0, + "step": 2451 + }, + { + "epoch": 0.6630611141157382, + "grad_norm": 9.6875, + "learning_rate": 0.019508197633101944, + "loss": 4.0303, + "mean_token_accuracy": 0.3041863441467285, + "num_tokens": 1253335054.0, + "step": 2452 + }, + { + "epoch": 0.6633315305570578, + "grad_norm": 3.0625, + "learning_rate": 0.01950768350091924, + "loss": 3.4594, + "mean_token_accuracy": 0.365037202835083, + "num_tokens": 1253859236.0, + "step": 2453 + }, + { + "epoch": 0.6636019469983775, + "grad_norm": 4.15625, + "learning_rate": 0.01950716910769579, + "loss": 3.5918, + "mean_token_accuracy": 0.3574674725532532, + "num_tokens": 1254383523.0, + "step": 2454 + }, + { + "epoch": 0.6638723634396971, + "grad_norm": 3.359375, + "learning_rate": 0.019506654453447375, + "loss": 3.4756, + "mean_token_accuracy": 0.387269526720047, + "num_tokens": 1254863341.0, + "step": 2455 + }, + { + "epoch": 0.6641427798810168, + "grad_norm": 5.40625, + "learning_rate": 0.019506139538189783, + "loss": 3.6141, + "mean_token_accuracy": 0.3464951515197754, + "num_tokens": 1255387512.0, + "step": 2456 + }, + { + "epoch": 0.6644131963223364, + "grad_norm": 2.640625, + "learning_rate": 0.019505624361938815, + "loss": 3.3958, + "mean_token_accuracy": 0.37197256088256836, + "num_tokens": 1255911630.0, + "step": 2457 + }, + { + "epoch": 0.6646836127636561, + "grad_norm": 4.3125, + "learning_rate": 0.01950510892471028, + "loss": 3.4322, + "mean_token_accuracy": 0.36372971534729004, + "num_tokens": 1256435799.0, + "step": 2458 + }, + { + "epoch": 0.6649540292049757, + "grad_norm": 2.203125, + "learning_rate": 0.01950459322651999, + "loss": 3.2808, + "mean_token_accuracy": 0.38831278681755066, + "num_tokens": 1256959978.0, + "step": 2459 + }, + { + "epoch": 0.6652244456462953, + "grad_norm": 3.078125, + "learning_rate": 0.01950407726738377, + "loss": 3.6226, + "mean_token_accuracy": 0.3306843042373657, + "num_tokens": 1257484111.0, + "step": 2460 + }, + { + "epoch": 0.6654948620876149, + "grad_norm": 2.0625, + "learning_rate": 0.01950356104731745, + "loss": 3.322, + "mean_token_accuracy": 0.383220911026001, + "num_tokens": 1258008261.0, + "step": 2461 + }, + { + "epoch": 0.6657652785289345, + "grad_norm": 3.4375, + "learning_rate": 0.01950304456633687, + "loss": 3.5672, + "mean_token_accuracy": 0.35016119480133057, + "num_tokens": 1258532429.0, + "step": 2462 + }, + { + "epoch": 0.6660356949702542, + "grad_norm": 2.90625, + "learning_rate": 0.019502527824457876, + "loss": 3.5526, + "mean_token_accuracy": 0.35442671179771423, + "num_tokens": 1259056684.0, + "step": 2463 + }, + { + "epoch": 0.6663061114115738, + "grad_norm": 3.96875, + "learning_rate": 0.019502010821696326, + "loss": 3.2585, + "mean_token_accuracy": 0.36815857887268066, + "num_tokens": 1259532432.0, + "step": 2464 + }, + { + "epoch": 0.6665765278528935, + "grad_norm": 2.578125, + "learning_rate": 0.01950149355806808, + "loss": 3.2182, + "mean_token_accuracy": 0.3833411633968353, + "num_tokens": 1260040147.0, + "step": 2465 + }, + { + "epoch": 0.6668469442942131, + "grad_norm": 3.65625, + "learning_rate": 0.01950097603358901, + "loss": 2.7695, + "mean_token_accuracy": 0.484943151473999, + "num_tokens": 1260518333.0, + "step": 2466 + }, + { + "epoch": 0.6671173607355327, + "grad_norm": 2.375, + "learning_rate": 0.019500458248274996, + "loss": 3.4223, + "mean_token_accuracy": 0.37486088275909424, + "num_tokens": 1261035133.0, + "step": 2467 + }, + { + "epoch": 0.6673877771768524, + "grad_norm": 3.859375, + "learning_rate": 0.019499940202141925, + "loss": 3.5788, + "mean_token_accuracy": 0.33858877420425415, + "num_tokens": 1261559235.0, + "step": 2468 + }, + { + "epoch": 0.667658193618172, + "grad_norm": 2.390625, + "learning_rate": 0.01949942189520569, + "loss": 3.3674, + "mean_token_accuracy": 0.3821987509727478, + "num_tokens": 1262083452.0, + "step": 2469 + }, + { + "epoch": 0.6679286100594917, + "grad_norm": 3.5, + "learning_rate": 0.019498903327482196, + "loss": 3.3974, + "mean_token_accuracy": 0.3750452995300293, + "num_tokens": 1262607709.0, + "step": 2470 + }, + { + "epoch": 0.6681990265008112, + "grad_norm": 65.5, + "learning_rate": 0.01949838449898736, + "loss": 28.0693, + "mean_token_accuracy": 0.0, + "num_tokens": 1263131967.0, + "step": 2471 + }, + { + "epoch": 0.6684694429421308, + "grad_norm": 6.78125, + "learning_rate": 0.01949786540973709, + "loss": 4.0407, + "mean_token_accuracy": 0.3513484001159668, + "num_tokens": 1263577361.0, + "step": 2472 + }, + { + "epoch": 0.6687398593834505, + "grad_norm": 3.046875, + "learning_rate": 0.019497346059747317, + "loss": 3.5144, + "mean_token_accuracy": 0.3673431873321533, + "num_tokens": 1264101403.0, + "step": 2473 + }, + { + "epoch": 0.6690102758247701, + "grad_norm": 3.546875, + "learning_rate": 0.019496826449033985, + "loss": 3.7127, + "mean_token_accuracy": 0.3549530506134033, + "num_tokens": 1264625503.0, + "step": 2474 + }, + { + "epoch": 0.6692806922660898, + "grad_norm": 2.953125, + "learning_rate": 0.019496306577613025, + "loss": 3.3906, + "mean_token_accuracy": 0.3643738031387329, + "num_tokens": 1265149554.0, + "step": 2475 + }, + { + "epoch": 0.6695511087074094, + "grad_norm": 2.421875, + "learning_rate": 0.01949578644550039, + "loss": 3.3898, + "mean_token_accuracy": 0.3983260691165924, + "num_tokens": 1265607962.0, + "step": 2476 + }, + { + "epoch": 0.6698215251487291, + "grad_norm": 2.453125, + "learning_rate": 0.019495266052712047, + "loss": 3.3923, + "mean_token_accuracy": 0.3731761574745178, + "num_tokens": 1266132241.0, + "step": 2477 + }, + { + "epoch": 0.6700919415900487, + "grad_norm": 3.328125, + "learning_rate": 0.01949474539926396, + "loss": 3.519, + "mean_token_accuracy": 0.3570188879966736, + "num_tokens": 1266656514.0, + "step": 2478 + }, + { + "epoch": 0.6703623580313683, + "grad_norm": 2.953125, + "learning_rate": 0.0194942244851721, + "loss": 3.5703, + "mean_token_accuracy": 0.3645786643028259, + "num_tokens": 1267137348.0, + "step": 2479 + }, + { + "epoch": 0.670632774472688, + "grad_norm": 2.71875, + "learning_rate": 0.019493703310452453, + "loss": 3.3776, + "mean_token_accuracy": 0.37767839431762695, + "num_tokens": 1267616680.0, + "step": 2480 + }, + { + "epoch": 0.6709031909140075, + "grad_norm": 2.359375, + "learning_rate": 0.019493181875121012, + "loss": 3.1667, + "mean_token_accuracy": 0.39586126804351807, + "num_tokens": 1268111456.0, + "step": 2481 + }, + { + "epoch": 0.6711736073553272, + "grad_norm": 2.25, + "learning_rate": 0.01949266017919377, + "loss": 3.4176, + "mean_token_accuracy": 0.39022842049598694, + "num_tokens": 1268609177.0, + "step": 2482 + }, + { + "epoch": 0.6714440237966468, + "grad_norm": 3.46875, + "learning_rate": 0.019492138222686743, + "loss": 3.1085, + "mean_token_accuracy": 0.39547866582870483, + "num_tokens": 1269133372.0, + "step": 2483 + }, + { + "epoch": 0.6717144402379664, + "grad_norm": 3.03125, + "learning_rate": 0.019491616005615938, + "loss": 3.3427, + "mean_token_accuracy": 0.3646683096885681, + "num_tokens": 1269648578.0, + "step": 2484 + }, + { + "epoch": 0.6719848566792861, + "grad_norm": 3.34375, + "learning_rate": 0.019491093527997386, + "loss": 3.268, + "mean_token_accuracy": 0.3789057433605194, + "num_tokens": 1270172752.0, + "step": 2485 + }, + { + "epoch": 0.6722552731206057, + "grad_norm": 3.34375, + "learning_rate": 0.019490570789847114, + "loss": 3.3376, + "mean_token_accuracy": 0.38944169878959656, + "num_tokens": 1270673113.0, + "step": 2486 + }, + { + "epoch": 0.6725256895619254, + "grad_norm": 4.21875, + "learning_rate": 0.019490047791181155, + "loss": 3.4984, + "mean_token_accuracy": 0.3616258502006531, + "num_tokens": 1271197382.0, + "step": 2487 + }, + { + "epoch": 0.672796106003245, + "grad_norm": 3.3125, + "learning_rate": 0.019489524532015565, + "loss": 3.4105, + "mean_token_accuracy": 0.37961623072624207, + "num_tokens": 1271721654.0, + "step": 2488 + }, + { + "epoch": 0.6730665224445647, + "grad_norm": 4.0, + "learning_rate": 0.019489001012366395, + "loss": 3.4621, + "mean_token_accuracy": 0.3530566096305847, + "num_tokens": 1272193569.0, + "step": 2489 + }, + { + "epoch": 0.6733369388858843, + "grad_norm": 2.171875, + "learning_rate": 0.019488477232249713, + "loss": 3.2359, + "mean_token_accuracy": 0.3900718092918396, + "num_tokens": 1272717804.0, + "step": 2490 + }, + { + "epoch": 0.673607355327204, + "grad_norm": 21.0, + "learning_rate": 0.019487953191681586, + "loss": 14.8298, + "mean_token_accuracy": 0.013672230765223503, + "num_tokens": 1273242050.0, + "step": 2491 + }, + { + "epoch": 0.6738777717685235, + "grad_norm": 8.4375, + "learning_rate": 0.019487428890678095, + "loss": 4.2451, + "mean_token_accuracy": 0.26609891653060913, + "num_tokens": 1273766325.0, + "step": 2492 + }, + { + "epoch": 0.6741481882098431, + "grad_norm": 2.65625, + "learning_rate": 0.01948690432925532, + "loss": 3.5565, + "mean_token_accuracy": 0.3441411256790161, + "num_tokens": 1274290439.0, + "step": 2493 + }, + { + "epoch": 0.6744186046511628, + "grad_norm": 3.3125, + "learning_rate": 0.019486379507429364, + "loss": 3.6961, + "mean_token_accuracy": 0.3897817134857178, + "num_tokens": 1274738996.0, + "step": 2494 + }, + { + "epoch": 0.6746890210924824, + "grad_norm": 3.03125, + "learning_rate": 0.01948585442521633, + "loss": 3.6289, + "mean_token_accuracy": 0.32738131284713745, + "num_tokens": 1275249010.0, + "step": 2495 + }, + { + "epoch": 0.674959437533802, + "grad_norm": 3.09375, + "learning_rate": 0.019485329082632327, + "loss": 3.6057, + "mean_token_accuracy": 0.37154722213745117, + "num_tokens": 1275773281.0, + "step": 2496 + }, + { + "epoch": 0.6752298539751217, + "grad_norm": 3.4375, + "learning_rate": 0.01948480347969347, + "loss": 3.5197, + "mean_token_accuracy": 0.3452133536338806, + "num_tokens": 1276297499.0, + "step": 2497 + }, + { + "epoch": 0.6755002704164413, + "grad_norm": 2.390625, + "learning_rate": 0.019484277616415895, + "loss": 3.3582, + "mean_token_accuracy": 0.3783561587333679, + "num_tokens": 1276821443.0, + "step": 2498 + }, + { + "epoch": 0.675770686857761, + "grad_norm": 3.03125, + "learning_rate": 0.01948375149281573, + "loss": 3.2424, + "mean_token_accuracy": 0.359836220741272, + "num_tokens": 1277310983.0, + "step": 2499 + }, + { + "epoch": 0.6760411032990806, + "grad_norm": 2.59375, + "learning_rate": 0.01948322510890912, + "loss": 3.4239, + "mean_token_accuracy": 0.37069171667099, + "num_tokens": 1277835126.0, + "step": 2500 + }, + { + "epoch": 0.6763115197404003, + "grad_norm": 3.078125, + "learning_rate": 0.019482698464712216, + "loss": 3.3405, + "mean_token_accuracy": 0.37532615661621094, + "num_tokens": 1278359216.0, + "step": 2501 + }, + { + "epoch": 0.6765819361817198, + "grad_norm": 2.703125, + "learning_rate": 0.019482171560241176, + "loss": 3.2686, + "mean_token_accuracy": 0.3874427080154419, + "num_tokens": 1278883386.0, + "step": 2502 + }, + { + "epoch": 0.6768523526230394, + "grad_norm": 3.875, + "learning_rate": 0.01948164439551217, + "loss": 3.3573, + "mean_token_accuracy": 0.35912537574768066, + "num_tokens": 1279391742.0, + "step": 2503 + }, + { + "epoch": 0.6771227690643591, + "grad_norm": 3.546875, + "learning_rate": 0.01948111697054137, + "loss": 3.5058, + "mean_token_accuracy": 0.3763924837112427, + "num_tokens": 1279915960.0, + "step": 2504 + }, + { + "epoch": 0.6773931855056787, + "grad_norm": 7.5625, + "learning_rate": 0.019480589285344962, + "loss": 3.2537, + "mean_token_accuracy": 0.42596134543418884, + "num_tokens": 1280440152.0, + "step": 2505 + }, + { + "epoch": 0.6776636019469984, + "grad_norm": 1.765625, + "learning_rate": 0.019480061339939136, + "loss": 3.2986, + "mean_token_accuracy": 0.3753129839897156, + "num_tokens": 1280942570.0, + "step": 2506 + }, + { + "epoch": 0.677934018388318, + "grad_norm": 3.578125, + "learning_rate": 0.019479533134340084, + "loss": 3.4219, + "mean_token_accuracy": 0.3762093782424927, + "num_tokens": 1281466751.0, + "step": 2507 + }, + { + "epoch": 0.6782044348296377, + "grad_norm": 2.78125, + "learning_rate": 0.019479004668564025, + "loss": 3.4114, + "mean_token_accuracy": 0.3909093737602234, + "num_tokens": 1281986451.0, + "step": 2508 + }, + { + "epoch": 0.6784748512709573, + "grad_norm": 4.34375, + "learning_rate": 0.019478475942627162, + "loss": 3.5563, + "mean_token_accuracy": 0.345214307308197, + "num_tokens": 1282510722.0, + "step": 2509 + }, + { + "epoch": 0.678745267712277, + "grad_norm": 3.515625, + "learning_rate": 0.019477946956545727, + "loss": 3.4021, + "mean_token_accuracy": 0.3761451244354248, + "num_tokens": 1283016524.0, + "step": 2510 + }, + { + "epoch": 0.6790156841535966, + "grad_norm": 13.0, + "learning_rate": 0.019477417710335947, + "loss": 15.8974, + "mean_token_accuracy": 0.0, + "num_tokens": 1283495864.0, + "step": 2511 + }, + { + "epoch": 0.6792861005949161, + "grad_norm": 8.0, + "learning_rate": 0.01947688820401406, + "loss": 4.0053, + "mean_token_accuracy": 0.3093865215778351, + "num_tokens": 1283985644.0, + "step": 2512 + }, + { + "epoch": 0.6795565170362358, + "grad_norm": 2.5625, + "learning_rate": 0.019476358437596316, + "loss": 3.4654, + "mean_token_accuracy": 0.34438544511795044, + "num_tokens": 1284509910.0, + "step": 2513 + }, + { + "epoch": 0.6798269334775554, + "grad_norm": 2.453125, + "learning_rate": 0.019475828411098964, + "loss": 3.265, + "mean_token_accuracy": 0.37323856353759766, + "num_tokens": 1284999762.0, + "step": 2514 + }, + { + "epoch": 0.680097349918875, + "grad_norm": 2.890625, + "learning_rate": 0.019475298124538274, + "loss": 3.4771, + "mean_token_accuracy": 0.37398383021354675, + "num_tokens": 1285507703.0, + "step": 2515 + }, + { + "epoch": 0.6803677663601947, + "grad_norm": 3.8125, + "learning_rate": 0.019474767577930513, + "loss": 3.5045, + "mean_token_accuracy": 0.31059542298316956, + "num_tokens": 1286031870.0, + "step": 2516 + }, + { + "epoch": 0.6806381828015143, + "grad_norm": 3.796875, + "learning_rate": 0.01947423677129196, + "loss": 3.3994, + "mean_token_accuracy": 0.385048508644104, + "num_tokens": 1286510906.0, + "step": 2517 + }, + { + "epoch": 0.680908599242834, + "grad_norm": 3.578125, + "learning_rate": 0.0194737057046389, + "loss": 3.603, + "mean_token_accuracy": 0.3480238616466522, + "num_tokens": 1287035188.0, + "step": 2518 + }, + { + "epoch": 0.6811790156841536, + "grad_norm": 3.21875, + "learning_rate": 0.01947317437798763, + "loss": 3.5128, + "mean_token_accuracy": 0.3575413227081299, + "num_tokens": 1287559472.0, + "step": 2519 + }, + { + "epoch": 0.6814494321254733, + "grad_norm": 3.203125, + "learning_rate": 0.019472642791354453, + "loss": 3.3414, + "mean_token_accuracy": 0.3747476637363434, + "num_tokens": 1288083714.0, + "step": 2520 + }, + { + "epoch": 0.6817198485667929, + "grad_norm": 2.796875, + "learning_rate": 0.01947211094475568, + "loss": 3.3507, + "mean_token_accuracy": 0.37695062160491943, + "num_tokens": 1288591290.0, + "step": 2521 + }, + { + "epoch": 0.6819902650081124, + "grad_norm": 3.15625, + "learning_rate": 0.019471578838207628, + "loss": 3.5963, + "mean_token_accuracy": 0.3778272271156311, + "num_tokens": 1289115467.0, + "step": 2522 + }, + { + "epoch": 0.6822606814494321, + "grad_norm": 3.671875, + "learning_rate": 0.019471046471726622, + "loss": 3.4765, + "mean_token_accuracy": 0.36804285645484924, + "num_tokens": 1289639720.0, + "step": 2523 + }, + { + "epoch": 0.6825310978907517, + "grad_norm": 3.234375, + "learning_rate": 0.019470513845329, + "loss": 3.2214, + "mean_token_accuracy": 0.39255815744400024, + "num_tokens": 1290132876.0, + "step": 2524 + }, + { + "epoch": 0.6828015143320714, + "grad_norm": 2.921875, + "learning_rate": 0.019469980959031102, + "loss": 3.429, + "mean_token_accuracy": 0.377200186252594, + "num_tokens": 1290657010.0, + "step": 2525 + }, + { + "epoch": 0.683071930773391, + "grad_norm": 3.125, + "learning_rate": 0.01946944781284928, + "loss": 3.532, + "mean_token_accuracy": 0.3643379509449005, + "num_tokens": 1291171265.0, + "step": 2526 + }, + { + "epoch": 0.6833423472147107, + "grad_norm": 3.1875, + "learning_rate": 0.019468914406799893, + "loss": 3.3996, + "mean_token_accuracy": 0.36283639073371887, + "num_tokens": 1291695520.0, + "step": 2527 + }, + { + "epoch": 0.6836127636560303, + "grad_norm": 2.78125, + "learning_rate": 0.019468380740899304, + "loss": 3.4119, + "mean_token_accuracy": 0.3697206974029541, + "num_tokens": 1292219686.0, + "step": 2528 + }, + { + "epoch": 0.6838831800973499, + "grad_norm": 2.921875, + "learning_rate": 0.01946784681516389, + "loss": 3.3826, + "mean_token_accuracy": 0.3707324266433716, + "num_tokens": 1292721876.0, + "step": 2529 + }, + { + "epoch": 0.6841535965386696, + "grad_norm": 4.25, + "learning_rate": 0.019467312629610038, + "loss": 3.481, + "mean_token_accuracy": 0.36133554577827454, + "num_tokens": 1293246104.0, + "step": 2530 + }, + { + "epoch": 0.6844240129799892, + "grad_norm": 89.5, + "learning_rate": 0.01946677818425413, + "loss": 29.496, + "mean_token_accuracy": 0.0, + "num_tokens": 1293641314.0, + "step": 2531 + }, + { + "epoch": 0.6846944294213089, + "grad_norm": 6.625, + "learning_rate": 0.01946624347911257, + "loss": 4.2205, + "mean_token_accuracy": 0.2755945324897766, + "num_tokens": 1294165553.0, + "step": 2532 + }, + { + "epoch": 0.6849648458626284, + "grad_norm": 2.953125, + "learning_rate": 0.01946570851420176, + "loss": 3.5951, + "mean_token_accuracy": 0.34953951835632324, + "num_tokens": 1294689770.0, + "step": 2533 + }, + { + "epoch": 0.685235262303948, + "grad_norm": 3.5625, + "learning_rate": 0.019465173289538118, + "loss": 3.4866, + "mean_token_accuracy": 0.35864120721817017, + "num_tokens": 1295213926.0, + "step": 2534 + }, + { + "epoch": 0.6855056787452677, + "grad_norm": 2.859375, + "learning_rate": 0.019464637805138067, + "loss": 3.5038, + "mean_token_accuracy": 0.3528861999511719, + "num_tokens": 1295738147.0, + "step": 2535 + }, + { + "epoch": 0.6857760951865873, + "grad_norm": 3.421875, + "learning_rate": 0.01946410206101803, + "loss": 3.4484, + "mean_token_accuracy": 0.36949288845062256, + "num_tokens": 1296189892.0, + "step": 2536 + }, + { + "epoch": 0.686046511627907, + "grad_norm": 2.671875, + "learning_rate": 0.019463566057194456, + "loss": 3.5639, + "mean_token_accuracy": 0.35991477966308594, + "num_tokens": 1296714107.0, + "step": 2537 + }, + { + "epoch": 0.6863169280692266, + "grad_norm": 4.125, + "learning_rate": 0.019463029793683785, + "loss": 3.5055, + "mean_token_accuracy": 0.3539281487464905, + "num_tokens": 1297238389.0, + "step": 2538 + }, + { + "epoch": 0.6865873445105463, + "grad_norm": 2.921875, + "learning_rate": 0.01946249327050247, + "loss": 3.2049, + "mean_token_accuracy": 0.37919890880584717, + "num_tokens": 1297762498.0, + "step": 2539 + }, + { + "epoch": 0.6868577609518659, + "grad_norm": 2.171875, + "learning_rate": 0.019461956487666972, + "loss": 3.4879, + "mean_token_accuracy": 0.37171852588653564, + "num_tokens": 1298286674.0, + "step": 2540 + }, + { + "epoch": 0.6871281773931855, + "grad_norm": 2.796875, + "learning_rate": 0.019461419445193768, + "loss": 3.4888, + "mean_token_accuracy": 0.3736114501953125, + "num_tokens": 1298810882.0, + "step": 2541 + }, + { + "epoch": 0.6873985938345052, + "grad_norm": 3.203125, + "learning_rate": 0.019460882143099333, + "loss": 3.5079, + "mean_token_accuracy": 0.3348579406738281, + "num_tokens": 1299335144.0, + "step": 2542 + }, + { + "epoch": 0.6876690102758247, + "grad_norm": 2.078125, + "learning_rate": 0.019460344581400148, + "loss": 3.273, + "mean_token_accuracy": 0.3725697994232178, + "num_tokens": 1299859364.0, + "step": 2543 + }, + { + "epoch": 0.6879394267171444, + "grad_norm": 2.84375, + "learning_rate": 0.019459806760112715, + "loss": 3.2944, + "mean_token_accuracy": 0.36549246311187744, + "num_tokens": 1300383400.0, + "step": 2544 + }, + { + "epoch": 0.688209843158464, + "grad_norm": 3.453125, + "learning_rate": 0.019459268679253527, + "loss": 3.5001, + "mean_token_accuracy": 0.3724074959754944, + "num_tokens": 1300907491.0, + "step": 2545 + }, + { + "epoch": 0.6884802595997837, + "grad_norm": 2.96875, + "learning_rate": 0.019458730338839103, + "loss": 3.1018, + "mean_token_accuracy": 0.3962283730506897, + "num_tokens": 1301423531.0, + "step": 2546 + }, + { + "epoch": 0.6887506760411033, + "grad_norm": 3.046875, + "learning_rate": 0.019458191738885956, + "loss": 3.4122, + "mean_token_accuracy": 0.377176433801651, + "num_tokens": 1301910560.0, + "step": 2547 + }, + { + "epoch": 0.6890210924824229, + "grad_norm": 3.171875, + "learning_rate": 0.019457652879410613, + "loss": 3.0588, + "mean_token_accuracy": 0.40753310918807983, + "num_tokens": 1302434815.0, + "step": 2548 + }, + { + "epoch": 0.6892915089237426, + "grad_norm": 3.78125, + "learning_rate": 0.019457113760429606, + "loss": 3.7685, + "mean_token_accuracy": 0.31759560108184814, + "num_tokens": 1302958982.0, + "step": 2549 + }, + { + "epoch": 0.6895619253650622, + "grad_norm": 3.671875, + "learning_rate": 0.01945657438195948, + "loss": 3.7114, + "mean_token_accuracy": 0.3533439636230469, + "num_tokens": 1303483217.0, + "step": 2550 + }, + { + "epoch": 0.6898323418063819, + "grad_norm": 424.0, + "learning_rate": 0.019456034744016783, + "loss": 40.5867, + "mean_token_accuracy": 2.8005679268972017e-05, + "num_tokens": 1304007477.0, + "step": 2551 + }, + { + "epoch": 0.6901027582477015, + "grad_norm": 7.53125, + "learning_rate": 0.019455494846618068, + "loss": 3.9424, + "mean_token_accuracy": 0.29692527651786804, + "num_tokens": 1304504030.0, + "step": 2552 + }, + { + "epoch": 0.690373174689021, + "grad_norm": 2.0625, + "learning_rate": 0.019454954689779907, + "loss": 3.4189, + "mean_token_accuracy": 0.37571072578430176, + "num_tokens": 1305028303.0, + "step": 2553 + }, + { + "epoch": 0.6906435911303407, + "grad_norm": 2.90625, + "learning_rate": 0.019454414273518874, + "loss": 3.4179, + "mean_token_accuracy": 0.37959644198417664, + "num_tokens": 1305552478.0, + "step": 2554 + }, + { + "epoch": 0.6909140075716603, + "grad_norm": 3.515625, + "learning_rate": 0.019453873597851546, + "loss": 3.5167, + "mean_token_accuracy": 0.3753942847251892, + "num_tokens": 1306076641.0, + "step": 2555 + }, + { + "epoch": 0.69118442401298, + "grad_norm": 3.40625, + "learning_rate": 0.019453332662794517, + "loss": 3.315, + "mean_token_accuracy": 0.37699997425079346, + "num_tokens": 1306537461.0, + "step": 2556 + }, + { + "epoch": 0.6914548404542996, + "grad_norm": 2.421875, + "learning_rate": 0.01945279146836438, + "loss": 3.4598, + "mean_token_accuracy": 0.36202993988990784, + "num_tokens": 1307061647.0, + "step": 2557 + }, + { + "epoch": 0.6917252568956193, + "grad_norm": 3.453125, + "learning_rate": 0.019452250014577743, + "loss": 3.3564, + "mean_token_accuracy": 0.3560352325439453, + "num_tokens": 1307585758.0, + "step": 2558 + }, + { + "epoch": 0.6919956733369389, + "grad_norm": 2.65625, + "learning_rate": 0.01945170830145122, + "loss": 3.2684, + "mean_token_accuracy": 0.37865468859672546, + "num_tokens": 1308110010.0, + "step": 2559 + }, + { + "epoch": 0.6922660897782585, + "grad_norm": 2.34375, + "learning_rate": 0.019451166329001433, + "loss": 3.4009, + "mean_token_accuracy": 0.3752806484699249, + "num_tokens": 1308634186.0, + "step": 2560 + }, + { + "epoch": 0.6925365062195782, + "grad_norm": 3.0, + "learning_rate": 0.019450624097245007, + "loss": 3.5092, + "mean_token_accuracy": 0.3555166721343994, + "num_tokens": 1309158249.0, + "step": 2561 + }, + { + "epoch": 0.6928069226608978, + "grad_norm": 3.609375, + "learning_rate": 0.019450081606198582, + "loss": 3.2452, + "mean_token_accuracy": 0.36266908049583435, + "num_tokens": 1309682498.0, + "step": 2562 + }, + { + "epoch": 0.6930773391022174, + "grad_norm": 2.25, + "learning_rate": 0.019449538855878804, + "loss": 3.5851, + "mean_token_accuracy": 0.3858020603656769, + "num_tokens": 1310132147.0, + "step": 2563 + }, + { + "epoch": 0.693347755543537, + "grad_norm": 3.078125, + "learning_rate": 0.019448995846302327, + "loss": 3.3923, + "mean_token_accuracy": 0.3650892376899719, + "num_tokens": 1310656360.0, + "step": 2564 + }, + { + "epoch": 0.6936181719848566, + "grad_norm": 2.765625, + "learning_rate": 0.01944845257748581, + "loss": 3.2564, + "mean_token_accuracy": 0.3862278461456299, + "num_tokens": 1311137339.0, + "step": 2565 + }, + { + "epoch": 0.6938885884261763, + "grad_norm": 3.015625, + "learning_rate": 0.019447909049445922, + "loss": 3.5039, + "mean_token_accuracy": 0.37671002745628357, + "num_tokens": 1311661426.0, + "step": 2566 + }, + { + "epoch": 0.6941590048674959, + "grad_norm": 3.328125, + "learning_rate": 0.01944736526219934, + "loss": 3.4318, + "mean_token_accuracy": 0.35018599033355713, + "num_tokens": 1312143005.0, + "step": 2567 + }, + { + "epoch": 0.6944294213088156, + "grad_norm": 2.546875, + "learning_rate": 0.019446821215762747, + "loss": 3.5633, + "mean_token_accuracy": 0.33867403864860535, + "num_tokens": 1312667172.0, + "step": 2568 + }, + { + "epoch": 0.6946998377501352, + "grad_norm": 2.875, + "learning_rate": 0.019446276910152844, + "loss": 3.5323, + "mean_token_accuracy": 0.34915339946746826, + "num_tokens": 1313191340.0, + "step": 2569 + }, + { + "epoch": 0.6949702541914549, + "grad_norm": 3.4375, + "learning_rate": 0.019445732345386324, + "loss": 3.4663, + "mean_token_accuracy": 0.39327338337898254, + "num_tokens": 1313715567.0, + "step": 2570 + }, + { + "epoch": 0.6952406706327745, + "grad_norm": 2.09375, + "learning_rate": 0.0194451875214799, + "loss": 11.5843, + "mean_token_accuracy": 0.0, + "num_tokens": 1314239834.0, + "step": 2571 + }, + { + "epoch": 0.6955110870740941, + "grad_norm": 7.1875, + "learning_rate": 0.019444642438450285, + "loss": 4.0299, + "mean_token_accuracy": 0.30082404613494873, + "num_tokens": 1314755365.0, + "step": 2572 + }, + { + "epoch": 0.6957815035154138, + "grad_norm": 2.0, + "learning_rate": 0.019444097096314207, + "loss": 3.5299, + "mean_token_accuracy": 0.36642467975616455, + "num_tokens": 1315279533.0, + "step": 2573 + }, + { + "epoch": 0.6960519199567333, + "grad_norm": 3.875, + "learning_rate": 0.019443551495088397, + "loss": 3.4462, + "mean_token_accuracy": 0.36313340067863464, + "num_tokens": 1315800617.0, + "step": 2574 + }, + { + "epoch": 0.696322336398053, + "grad_norm": 3.78125, + "learning_rate": 0.0194430056347896, + "loss": 3.3859, + "mean_token_accuracy": 0.37035781145095825, + "num_tokens": 1316324774.0, + "step": 2575 + }, + { + "epoch": 0.6965927528393726, + "grad_norm": 3.375, + "learning_rate": 0.019442459515434555, + "loss": 3.4438, + "mean_token_accuracy": 0.36338746547698975, + "num_tokens": 1316849046.0, + "step": 2576 + }, + { + "epoch": 0.6968631692806923, + "grad_norm": 3.265625, + "learning_rate": 0.019441913137040027, + "loss": 3.2697, + "mean_token_accuracy": 0.3717855215072632, + "num_tokens": 1317373324.0, + "step": 2577 + }, + { + "epoch": 0.6971335857220119, + "grad_norm": 3.390625, + "learning_rate": 0.01944136649962278, + "loss": 3.3917, + "mean_token_accuracy": 0.376373827457428, + "num_tokens": 1317884806.0, + "step": 2578 + }, + { + "epoch": 0.6974040021633315, + "grad_norm": 3.546875, + "learning_rate": 0.019440819603199582, + "loss": 3.5543, + "mean_token_accuracy": 0.3733994960784912, + "num_tokens": 1318358130.0, + "step": 2579 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 4.25, + "learning_rate": 0.019440272447787215, + "loss": 3.7028, + "mean_token_accuracy": 0.34046441316604614, + "num_tokens": 1318882352.0, + "step": 2580 + }, + { + "epoch": 0.6979448350459708, + "grad_norm": 2.828125, + "learning_rate": 0.01943972503340247, + "loss": 3.1791, + "mean_token_accuracy": 0.39205873012542725, + "num_tokens": 1319406528.0, + "step": 2581 + }, + { + "epoch": 0.6982152514872905, + "grad_norm": 3.03125, + "learning_rate": 0.01943917736006214, + "loss": 3.3041, + "mean_token_accuracy": 0.38614708185195923, + "num_tokens": 1319930799.0, + "step": 2582 + }, + { + "epoch": 0.6984856679286101, + "grad_norm": 3.203125, + "learning_rate": 0.01943862942778303, + "loss": 3.2781, + "mean_token_accuracy": 0.37289077043533325, + "num_tokens": 1320455070.0, + "step": 2583 + }, + { + "epoch": 0.6987560843699296, + "grad_norm": 3.265625, + "learning_rate": 0.019438081236581955, + "loss": 3.302, + "mean_token_accuracy": 0.39449843764305115, + "num_tokens": 1320979166.0, + "step": 2584 + }, + { + "epoch": 0.6990265008112493, + "grad_norm": 7.40625, + "learning_rate": 0.019437532786475732, + "loss": 3.3554, + "mean_token_accuracy": 0.36768946051597595, + "num_tokens": 1321503413.0, + "step": 2585 + }, + { + "epoch": 0.6992969172525689, + "grad_norm": 1.5625, + "learning_rate": 0.01943698407748119, + "loss": 3.3588, + "mean_token_accuracy": 0.36436909437179565, + "num_tokens": 1322027633.0, + "step": 2586 + }, + { + "epoch": 0.6995673336938886, + "grad_norm": 3.390625, + "learning_rate": 0.019436435109615166, + "loss": 3.5565, + "mean_token_accuracy": 0.3712403476238251, + "num_tokens": 1322541807.0, + "step": 2587 + }, + { + "epoch": 0.6998377501352082, + "grad_norm": 2.953125, + "learning_rate": 0.0194358858828945, + "loss": 3.4123, + "mean_token_accuracy": 0.3895031809806824, + "num_tokens": 1323065970.0, + "step": 2588 + }, + { + "epoch": 0.7001081665765279, + "grad_norm": 3.671875, + "learning_rate": 0.019435336397336047, + "loss": 3.4252, + "mean_token_accuracy": 0.3439077138900757, + "num_tokens": 1323590142.0, + "step": 2589 + }, + { + "epoch": 0.7003785830178475, + "grad_norm": 2.3125, + "learning_rate": 0.019434786652956667, + "loss": 3.4361, + "mean_token_accuracy": 0.3788321614265442, + "num_tokens": 1324114339.0, + "step": 2590 + }, + { + "epoch": 0.7006489994591671, + "grad_norm": 86.5, + "learning_rate": 0.019434236649773232, + "loss": 18.2887, + "mean_token_accuracy": 0.03301598131656647, + "num_tokens": 1324638557.0, + "step": 2591 + }, + { + "epoch": 0.7009194159004868, + "grad_norm": 8.625, + "learning_rate": 0.019433686387802608, + "loss": 3.986, + "mean_token_accuracy": 0.29181593656539917, + "num_tokens": 1325162762.0, + "step": 2592 + }, + { + "epoch": 0.7011898323418064, + "grad_norm": 2.3125, + "learning_rate": 0.019433135867061684, + "loss": 3.5895, + "mean_token_accuracy": 0.35779091715812683, + "num_tokens": 1325686991.0, + "step": 2593 + }, + { + "epoch": 0.701460248783126, + "grad_norm": 2.890625, + "learning_rate": 0.01943258508756735, + "loss": 3.5209, + "mean_token_accuracy": 0.3585008382797241, + "num_tokens": 1326211273.0, + "step": 2594 + }, + { + "epoch": 0.7017306652244456, + "grad_norm": 3.78125, + "learning_rate": 0.019432034049336515, + "loss": 3.3278, + "mean_token_accuracy": 0.3836660385131836, + "num_tokens": 1326659831.0, + "step": 2595 + }, + { + "epoch": 0.7020010816657652, + "grad_norm": 2.625, + "learning_rate": 0.01943148275238607, + "loss": 3.2972, + "mean_token_accuracy": 0.39119285345077515, + "num_tokens": 1327171172.0, + "step": 2596 + }, + { + "epoch": 0.7022714981070849, + "grad_norm": 4.25, + "learning_rate": 0.01943093119673294, + "loss": 3.3464, + "mean_token_accuracy": 0.3486051857471466, + "num_tokens": 1327674142.0, + "step": 2597 + }, + { + "epoch": 0.7025419145484045, + "grad_norm": 3.109375, + "learning_rate": 0.01943037938239405, + "loss": 3.6847, + "mean_token_accuracy": 0.35419541597366333, + "num_tokens": 1328180827.0, + "step": 2598 + }, + { + "epoch": 0.7028123309897242, + "grad_norm": 3.25, + "learning_rate": 0.019429827309386326, + "loss": 3.4535, + "mean_token_accuracy": 0.36915895342826843, + "num_tokens": 1328637908.0, + "step": 2599 + }, + { + "epoch": 0.7030827474310438, + "grad_norm": 2.46875, + "learning_rate": 0.019429274977726715, + "loss": 3.2628, + "mean_token_accuracy": 0.3472215533256531, + "num_tokens": 1329162146.0, + "step": 2600 + }, + { + "epoch": 0.7033531638723635, + "grad_norm": 3.03125, + "learning_rate": 0.01942872238743215, + "loss": 3.3771, + "mean_token_accuracy": 0.35813993215560913, + "num_tokens": 1329686257.0, + "step": 2601 + }, + { + "epoch": 0.7036235803136831, + "grad_norm": 3.140625, + "learning_rate": 0.019428169538519603, + "loss": 3.439, + "mean_token_accuracy": 0.3700895607471466, + "num_tokens": 1330210473.0, + "step": 2602 + }, + { + "epoch": 0.7038939967550027, + "grad_norm": 3.3125, + "learning_rate": 0.019427616431006024, + "loss": 3.5872, + "mean_token_accuracy": 0.3390224874019623, + "num_tokens": 1330734709.0, + "step": 2603 + }, + { + "epoch": 0.7041644131963224, + "grad_norm": 3.0625, + "learning_rate": 0.01942706306490839, + "loss": 3.2208, + "mean_token_accuracy": 0.4140726327896118, + "num_tokens": 1331258990.0, + "step": 2604 + }, + { + "epoch": 0.7044348296376419, + "grad_norm": 2.921875, + "learning_rate": 0.01942650944024368, + "loss": 3.4404, + "mean_token_accuracy": 0.3591821789741516, + "num_tokens": 1331783164.0, + "step": 2605 + }, + { + "epoch": 0.7047052460789616, + "grad_norm": 4.59375, + "learning_rate": 0.01942595555702888, + "loss": 3.4183, + "mean_token_accuracy": 0.3665298521518707, + "num_tokens": 1332270196.0, + "step": 2606 + }, + { + "epoch": 0.7049756625202812, + "grad_norm": 2.59375, + "learning_rate": 0.01942540141528098, + "loss": 3.1204, + "mean_token_accuracy": 0.38500988483428955, + "num_tokens": 1332794371.0, + "step": 2607 + }, + { + "epoch": 0.7052460789616009, + "grad_norm": 2.890625, + "learning_rate": 0.01942484701501699, + "loss": 3.3495, + "mean_token_accuracy": 0.38000649213790894, + "num_tokens": 1333318457.0, + "step": 2608 + }, + { + "epoch": 0.7055164954029205, + "grad_norm": 3.03125, + "learning_rate": 0.019424292356253918, + "loss": 3.6207, + "mean_token_accuracy": 0.34931501746177673, + "num_tokens": 1333824443.0, + "step": 2609 + }, + { + "epoch": 0.7057869118442401, + "grad_norm": 3.15625, + "learning_rate": 0.019423737439008784, + "loss": 3.3703, + "mean_token_accuracy": 0.35802510380744934, + "num_tokens": 1334348595.0, + "step": 2610 + }, + { + "epoch": 0.7060573282855598, + "grad_norm": 157.0, + "learning_rate": 0.019423182263298613, + "loss": 12.9191, + "mean_token_accuracy": 3.8522397517226636e-05, + "num_tokens": 1334872792.0, + "step": 2611 + }, + { + "epoch": 0.7063277447268794, + "grad_norm": 6.8125, + "learning_rate": 0.019422626829140432, + "loss": 3.9372, + "mean_token_accuracy": 0.3100701570510864, + "num_tokens": 1335396875.0, + "step": 2612 + }, + { + "epoch": 0.7065981611681991, + "grad_norm": 2.234375, + "learning_rate": 0.019422071136551297, + "loss": 3.5753, + "mean_token_accuracy": 0.3481352627277374, + "num_tokens": 1335921153.0, + "step": 2613 + }, + { + "epoch": 0.7068685776095187, + "grad_norm": 2.984375, + "learning_rate": 0.01942151518554825, + "loss": 3.5346, + "mean_token_accuracy": 0.3520209789276123, + "num_tokens": 1336445418.0, + "step": 2614 + }, + { + "epoch": 0.7071389940508382, + "grad_norm": 3.4375, + "learning_rate": 0.01942095897614835, + "loss": 3.7161, + "mean_token_accuracy": 0.3289639949798584, + "num_tokens": 1336969563.0, + "step": 2615 + }, + { + "epoch": 0.7074094104921579, + "grad_norm": 3.578125, + "learning_rate": 0.01942040250836867, + "loss": 3.5444, + "mean_token_accuracy": 0.3517739772796631, + "num_tokens": 1337493779.0, + "step": 2616 + }, + { + "epoch": 0.7076798269334775, + "grad_norm": 4.0625, + "learning_rate": 0.01941984578222627, + "loss": 3.5464, + "mean_token_accuracy": 0.378248929977417, + "num_tokens": 1338017942.0, + "step": 2617 + }, + { + "epoch": 0.7079502433747972, + "grad_norm": 3.09375, + "learning_rate": 0.019419288797738247, + "loss": 3.4607, + "mean_token_accuracy": 0.36893725395202637, + "num_tokens": 1338477574.0, + "step": 2618 + }, + { + "epoch": 0.7082206598161168, + "grad_norm": 3.046875, + "learning_rate": 0.01941873155492168, + "loss": 3.4431, + "mean_token_accuracy": 0.3689843714237213, + "num_tokens": 1339001851.0, + "step": 2619 + }, + { + "epoch": 0.7084910762574365, + "grad_norm": 2.890625, + "learning_rate": 0.019418174053793674, + "loss": 3.6697, + "mean_token_accuracy": 0.335417777299881, + "num_tokens": 1339525967.0, + "step": 2620 + }, + { + "epoch": 0.7087614926987561, + "grad_norm": 2.78125, + "learning_rate": 0.019417616294371328, + "loss": 3.509, + "mean_token_accuracy": 0.36028334498405457, + "num_tokens": 1340049985.0, + "step": 2621 + }, + { + "epoch": 0.7090319091400757, + "grad_norm": 2.9375, + "learning_rate": 0.019417058276671763, + "loss": 3.5365, + "mean_token_accuracy": 0.3456782102584839, + "num_tokens": 1340574263.0, + "step": 2622 + }, + { + "epoch": 0.7093023255813954, + "grad_norm": 3.203125, + "learning_rate": 0.019416500000712097, + "loss": 3.3246, + "mean_token_accuracy": 0.3737713098526001, + "num_tokens": 1341098541.0, + "step": 2623 + }, + { + "epoch": 0.709572742022715, + "grad_norm": 2.296875, + "learning_rate": 0.01941594146650946, + "loss": 3.1984, + "mean_token_accuracy": 0.37100473046302795, + "num_tokens": 1341622807.0, + "step": 2624 + }, + { + "epoch": 0.7098431584640346, + "grad_norm": 3.078125, + "learning_rate": 0.019415382674080987, + "loss": 3.328, + "mean_token_accuracy": 0.3833531141281128, + "num_tokens": 1342147082.0, + "step": 2625 + }, + { + "epoch": 0.7101135749053542, + "grad_norm": 2.9375, + "learning_rate": 0.019414823623443826, + "loss": 3.3882, + "mean_token_accuracy": 0.3789554238319397, + "num_tokens": 1342671289.0, + "step": 2626 + }, + { + "epoch": 0.7103839913466738, + "grad_norm": 3.3125, + "learning_rate": 0.01941426431461513, + "loss": 3.3088, + "mean_token_accuracy": 0.3692198395729065, + "num_tokens": 1343166161.0, + "step": 2627 + }, + { + "epoch": 0.7106544077879935, + "grad_norm": 3.734375, + "learning_rate": 0.01941370474761206, + "loss": 3.5105, + "mean_token_accuracy": 0.3694918155670166, + "num_tokens": 1343690441.0, + "step": 2628 + }, + { + "epoch": 0.7109248242293131, + "grad_norm": 3.25, + "learning_rate": 0.019413144922451785, + "loss": 3.4269, + "mean_token_accuracy": 0.3446064591407776, + "num_tokens": 1344214646.0, + "step": 2629 + }, + { + "epoch": 0.7111952406706328, + "grad_norm": 2.90625, + "learning_rate": 0.01941258483915148, + "loss": 3.4974, + "mean_token_accuracy": 0.3764868974685669, + "num_tokens": 1344738928.0, + "step": 2630 + }, + { + "epoch": 0.7114656571119524, + "grad_norm": 480.0, + "learning_rate": 0.019412024497728338, + "loss": 38.7273, + "mean_token_accuracy": 3.083928459091112e-05, + "num_tokens": 1345243101.0, + "step": 2631 + }, + { + "epoch": 0.7117360735532721, + "grad_norm": 6.625, + "learning_rate": 0.019411463898199543, + "loss": 4.0539, + "mean_token_accuracy": 0.27994006872177124, + "num_tokens": 1345767212.0, + "step": 2632 + }, + { + "epoch": 0.7120064899945917, + "grad_norm": 2.859375, + "learning_rate": 0.019410903040582298, + "loss": 3.2221, + "mean_token_accuracy": 0.36314448714256287, + "num_tokens": 1346291488.0, + "step": 2633 + }, + { + "epoch": 0.7122769064359114, + "grad_norm": 3.015625, + "learning_rate": 0.019410341924893815, + "loss": 3.6548, + "mean_token_accuracy": 0.3385869860649109, + "num_tokens": 1346815713.0, + "step": 2634 + }, + { + "epoch": 0.7125473228772309, + "grad_norm": 2.875, + "learning_rate": 0.019409780551151307, + "loss": 3.4712, + "mean_token_accuracy": 0.36222079396247864, + "num_tokens": 1347339986.0, + "step": 2635 + }, + { + "epoch": 0.7128177393185505, + "grad_norm": 2.90625, + "learning_rate": 0.019409218919372, + "loss": 3.1417, + "mean_token_accuracy": 0.38960129022598267, + "num_tokens": 1347864236.0, + "step": 2636 + }, + { + "epoch": 0.7130881557598702, + "grad_norm": 2.6875, + "learning_rate": 0.01940865702957313, + "loss": 3.3256, + "mean_token_accuracy": 0.3709319531917572, + "num_tokens": 1348388340.0, + "step": 2637 + }, + { + "epoch": 0.7133585722011898, + "grad_norm": 2.765625, + "learning_rate": 0.01940809488177193, + "loss": 3.2689, + "mean_token_accuracy": 0.37617987394332886, + "num_tokens": 1348909212.0, + "step": 2638 + }, + { + "epoch": 0.7136289886425095, + "grad_norm": 3.078125, + "learning_rate": 0.019407532475985655, + "loss": 3.6562, + "mean_token_accuracy": 0.3422202467918396, + "num_tokens": 1349433440.0, + "step": 2639 + }, + { + "epoch": 0.7138994050838291, + "grad_norm": 2.859375, + "learning_rate": 0.01940696981223156, + "loss": 3.3921, + "mean_token_accuracy": 0.3738479018211365, + "num_tokens": 1349957695.0, + "step": 2640 + }, + { + "epoch": 0.7141698215251487, + "grad_norm": 2.71875, + "learning_rate": 0.019406406890526903, + "loss": 3.4001, + "mean_token_accuracy": 0.36670851707458496, + "num_tokens": 1350481956.0, + "step": 2641 + }, + { + "epoch": 0.7144402379664684, + "grad_norm": 3.234375, + "learning_rate": 0.019405843710888965, + "loss": 3.2403, + "mean_token_accuracy": 0.38274267315864563, + "num_tokens": 1350949876.0, + "step": 2642 + }, + { + "epoch": 0.714710654407788, + "grad_norm": 2.78125, + "learning_rate": 0.019405280273335018, + "loss": 3.4408, + "mean_token_accuracy": 0.36458295583724976, + "num_tokens": 1351474037.0, + "step": 2643 + }, + { + "epoch": 0.7149810708491077, + "grad_norm": 3.3125, + "learning_rate": 0.01940471657788236, + "loss": 3.439, + "mean_token_accuracy": 0.33392661809921265, + "num_tokens": 1351998266.0, + "step": 2644 + }, + { + "epoch": 0.7152514872904273, + "grad_norm": 1.984375, + "learning_rate": 0.01940415262454828, + "loss": 3.3097, + "mean_token_accuracy": 0.3847547471523285, + "num_tokens": 1352522378.0, + "step": 2645 + }, + { + "epoch": 0.7155219037317468, + "grad_norm": 3.109375, + "learning_rate": 0.019403588413350076, + "loss": 3.421, + "mean_token_accuracy": 0.37394434213638306, + "num_tokens": 1353046557.0, + "step": 2646 + }, + { + "epoch": 0.7157923201730665, + "grad_norm": 2.859375, + "learning_rate": 0.01940302394430507, + "loss": 3.3, + "mean_token_accuracy": 0.3827298879623413, + "num_tokens": 1353570714.0, + "step": 2647 + }, + { + "epoch": 0.7160627366143861, + "grad_norm": 2.671875, + "learning_rate": 0.019402459217430577, + "loss": 3.3969, + "mean_token_accuracy": 0.36779603362083435, + "num_tokens": 1354094985.0, + "step": 2648 + }, + { + "epoch": 0.7163331530557058, + "grad_norm": 2.578125, + "learning_rate": 0.019401894232743928, + "loss": 3.2834, + "mean_token_accuracy": 0.3832230269908905, + "num_tokens": 1354619210.0, + "step": 2649 + }, + { + "epoch": 0.7166035694970254, + "grad_norm": 2.609375, + "learning_rate": 0.01940132899026245, + "loss": 3.3277, + "mean_token_accuracy": 0.384319543838501, + "num_tokens": 1355137683.0, + "step": 2650 + }, + { + "epoch": 0.7168739859383451, + "grad_norm": 64.0, + "learning_rate": 0.019400763490003494, + "loss": 10.0587, + "mean_token_accuracy": 0.006119932048022747, + "num_tokens": 1355653872.0, + "step": 2651 + }, + { + "epoch": 0.7171444023796647, + "grad_norm": 11.9375, + "learning_rate": 0.01940019773198441, + "loss": 4.1846, + "mean_token_accuracy": 0.26060473918914795, + "num_tokens": 1356178138.0, + "step": 2652 + }, + { + "epoch": 0.7174148188209843, + "grad_norm": 2.703125, + "learning_rate": 0.01939963171622256, + "loss": 3.5052, + "mean_token_accuracy": 0.3567204475402832, + "num_tokens": 1356645057.0, + "step": 2653 + }, + { + "epoch": 0.717685235262304, + "grad_norm": 2.3125, + "learning_rate": 0.019399065442735303, + "loss": 3.4184, + "mean_token_accuracy": 0.35485488176345825, + "num_tokens": 1357169152.0, + "step": 2654 + }, + { + "epoch": 0.7179556517036236, + "grad_norm": 2.484375, + "learning_rate": 0.01939849891154002, + "loss": 3.3749, + "mean_token_accuracy": 0.37202510237693787, + "num_tokens": 1357639725.0, + "step": 2655 + }, + { + "epoch": 0.7182260681449432, + "grad_norm": 2.625, + "learning_rate": 0.019397932122654088, + "loss": 3.2605, + "mean_token_accuracy": 0.37214571237564087, + "num_tokens": 1358163982.0, + "step": 2656 + }, + { + "epoch": 0.7184964845862628, + "grad_norm": 2.5, + "learning_rate": 0.019397365076094904, + "loss": 3.1336, + "mean_token_accuracy": 0.37750399112701416, + "num_tokens": 1358688146.0, + "step": 2657 + }, + { + "epoch": 0.7187669010275824, + "grad_norm": 3.75, + "learning_rate": 0.019396797771879866, + "loss": 3.4841, + "mean_token_accuracy": 0.37303560972213745, + "num_tokens": 1359212424.0, + "step": 2658 + }, + { + "epoch": 0.7190373174689021, + "grad_norm": 3.484375, + "learning_rate": 0.01939623021002638, + "loss": 3.3344, + "mean_token_accuracy": 0.39362552762031555, + "num_tokens": 1359637450.0, + "step": 2659 + }, + { + "epoch": 0.7193077339102217, + "grad_norm": 3.3125, + "learning_rate": 0.019395662390551856, + "loss": 3.5937, + "mean_token_accuracy": 0.37561720609664917, + "num_tokens": 1360114074.0, + "step": 2660 + }, + { + "epoch": 0.7195781503515414, + "grad_norm": 4.4375, + "learning_rate": 0.01939509431347372, + "loss": 3.5242, + "mean_token_accuracy": 0.3662254214286804, + "num_tokens": 1360638232.0, + "step": 2661 + }, + { + "epoch": 0.719848566792861, + "grad_norm": 3.65625, + "learning_rate": 0.019394525978809404, + "loss": 3.2406, + "mean_token_accuracy": 0.39544111490249634, + "num_tokens": 1361162458.0, + "step": 2662 + }, + { + "epoch": 0.7201189832341807, + "grad_norm": 4.25, + "learning_rate": 0.019393957386576347, + "loss": 3.3342, + "mean_token_accuracy": 0.41311243176460266, + "num_tokens": 1361686713.0, + "step": 2663 + }, + { + "epoch": 0.7203893996755003, + "grad_norm": 3.703125, + "learning_rate": 0.01939338853679199, + "loss": 3.6394, + "mean_token_accuracy": 0.3497793972492218, + "num_tokens": 1362164106.0, + "step": 2664 + }, + { + "epoch": 0.72065981611682, + "grad_norm": 3.6875, + "learning_rate": 0.019392819429473785, + "loss": 3.1894, + "mean_token_accuracy": 0.43871188163757324, + "num_tokens": 1362567770.0, + "step": 2665 + }, + { + "epoch": 0.7209302325581395, + "grad_norm": 3.734375, + "learning_rate": 0.0193922500646392, + "loss": 3.3871, + "mean_token_accuracy": 0.36435666680336, + "num_tokens": 1363041975.0, + "step": 2666 + }, + { + "epoch": 0.7212006489994591, + "grad_norm": 2.890625, + "learning_rate": 0.019391680442305707, + "loss": 3.4022, + "mean_token_accuracy": 0.359468549489975, + "num_tokens": 1363566179.0, + "step": 2667 + }, + { + "epoch": 0.7214710654407788, + "grad_norm": 3.6875, + "learning_rate": 0.01939111056249078, + "loss": 3.5685, + "mean_token_accuracy": 0.3448789715766907, + "num_tokens": 1364090342.0, + "step": 2668 + }, + { + "epoch": 0.7217414818820984, + "grad_norm": 2.296875, + "learning_rate": 0.019390540425211898, + "loss": 3.3023, + "mean_token_accuracy": 0.38579362630844116, + "num_tokens": 1364609013.0, + "step": 2669 + }, + { + "epoch": 0.722011898323418, + "grad_norm": 3.375, + "learning_rate": 0.019389970030486564, + "loss": 3.3765, + "mean_token_accuracy": 0.38133504986763, + "num_tokens": 1365133276.0, + "step": 2670 + }, + { + "epoch": 0.7222823147647377, + "grad_norm": 35.0, + "learning_rate": 0.01938939937833228, + "loss": 11.5186, + "mean_token_accuracy": 9.034580216393806e-06, + "num_tokens": 1365657400.0, + "step": 2671 + }, + { + "epoch": 0.7225527312060573, + "grad_norm": 7.96875, + "learning_rate": 0.019388828468766547, + "loss": 4.0699, + "mean_token_accuracy": 0.2863035798072815, + "num_tokens": 1366181596.0, + "step": 2672 + }, + { + "epoch": 0.722823147647377, + "grad_norm": 2.171875, + "learning_rate": 0.019388257301806888, + "loss": 3.6176, + "mean_token_accuracy": 0.35324811935424805, + "num_tokens": 1366689771.0, + "step": 2673 + }, + { + "epoch": 0.7230935640886966, + "grad_norm": 3.109375, + "learning_rate": 0.019387685877470827, + "loss": 3.3877, + "mean_token_accuracy": 0.35362541675567627, + "num_tokens": 1367214000.0, + "step": 2674 + }, + { + "epoch": 0.7233639805300163, + "grad_norm": 2.796875, + "learning_rate": 0.019387114195775896, + "loss": 3.3245, + "mean_token_accuracy": 0.37004759907722473, + "num_tokens": 1367727927.0, + "step": 2675 + }, + { + "epoch": 0.7236343969713358, + "grad_norm": 2.796875, + "learning_rate": 0.019386542256739636, + "loss": 3.4917, + "mean_token_accuracy": 0.3628527522087097, + "num_tokens": 1368252115.0, + "step": 2676 + }, + { + "epoch": 0.7239048134126554, + "grad_norm": 3.625, + "learning_rate": 0.019385970060379596, + "loss": 3.4713, + "mean_token_accuracy": 0.38511842489242554, + "num_tokens": 1368717351.0, + "step": 2677 + }, + { + "epoch": 0.7241752298539751, + "grad_norm": 3.328125, + "learning_rate": 0.019385397606713334, + "loss": 3.2768, + "mean_token_accuracy": 0.3809208571910858, + "num_tokens": 1369217583.0, + "step": 2678 + }, + { + "epoch": 0.7244456462952947, + "grad_norm": 3.0, + "learning_rate": 0.019384824895758414, + "loss": 3.1912, + "mean_token_accuracy": 0.4071241021156311, + "num_tokens": 1369643326.0, + "step": 2679 + }, + { + "epoch": 0.7247160627366144, + "grad_norm": 2.90625, + "learning_rate": 0.019384251927532407, + "loss": 3.2601, + "mean_token_accuracy": 0.397043913602829, + "num_tokens": 1370107534.0, + "step": 2680 + }, + { + "epoch": 0.724986479177934, + "grad_norm": 4.40625, + "learning_rate": 0.0193836787020529, + "loss": 3.6868, + "mean_token_accuracy": 0.35043662786483765, + "num_tokens": 1370631624.0, + "step": 2681 + }, + { + "epoch": 0.7252568956192537, + "grad_norm": 3.46875, + "learning_rate": 0.019383105219337466, + "loss": 3.3176, + "mean_token_accuracy": 0.3938358426094055, + "num_tokens": 1371080504.0, + "step": 2682 + }, + { + "epoch": 0.7255273120605733, + "grad_norm": 2.5, + "learning_rate": 0.019382531479403716, + "loss": 3.4692, + "mean_token_accuracy": 0.36044031381607056, + "num_tokens": 1371604701.0, + "step": 2683 + }, + { + "epoch": 0.725797728501893, + "grad_norm": 2.96875, + "learning_rate": 0.019381957482269247, + "loss": 3.1911, + "mean_token_accuracy": 0.3803524672985077, + "num_tokens": 1372110547.0, + "step": 2684 + }, + { + "epoch": 0.7260681449432126, + "grad_norm": 2.609375, + "learning_rate": 0.019381383227951677, + "loss": 3.2657, + "mean_token_accuracy": 0.3894701302051544, + "num_tokens": 1372582189.0, + "step": 2685 + }, + { + "epoch": 0.7263385613845322, + "grad_norm": 3.171875, + "learning_rate": 0.019380808716468616, + "loss": 3.4618, + "mean_token_accuracy": 0.35124820470809937, + "num_tokens": 1373106440.0, + "step": 2686 + }, + { + "epoch": 0.7266089778258518, + "grad_norm": 2.65625, + "learning_rate": 0.0193802339478377, + "loss": 3.2705, + "mean_token_accuracy": 0.39254021644592285, + "num_tokens": 1373630596.0, + "step": 2687 + }, + { + "epoch": 0.7268793942671714, + "grad_norm": 3.703125, + "learning_rate": 0.019379658922076562, + "loss": 3.306, + "mean_token_accuracy": 0.3726350665092468, + "num_tokens": 1374129178.0, + "step": 2688 + }, + { + "epoch": 0.727149810708491, + "grad_norm": 2.921875, + "learning_rate": 0.019379083639202845, + "loss": 3.4529, + "mean_token_accuracy": 0.3868981599807739, + "num_tokens": 1374632575.0, + "step": 2689 + }, + { + "epoch": 0.7274202271498107, + "grad_norm": 4.0625, + "learning_rate": 0.0193785080992342, + "loss": 3.6299, + "mean_token_accuracy": 0.3314782977104187, + "num_tokens": 1375156771.0, + "step": 2690 + }, + { + "epoch": 0.7276906435911303, + "grad_norm": 58.0, + "learning_rate": 0.01937793230218829, + "loss": 28.7555, + "mean_token_accuracy": 0.034546248614788055, + "num_tokens": 1375661515.0, + "step": 2691 + }, + { + "epoch": 0.72796106003245, + "grad_norm": 9.375, + "learning_rate": 0.01937735624808278, + "loss": 4.1101, + "mean_token_accuracy": 0.3031056523323059, + "num_tokens": 1376185786.0, + "step": 2692 + }, + { + "epoch": 0.7282314764737696, + "grad_norm": 1.984375, + "learning_rate": 0.019376779936935335, + "loss": 3.496, + "mean_token_accuracy": 0.34989288449287415, + "num_tokens": 1376709917.0, + "step": 2693 + }, + { + "epoch": 0.7285018929150893, + "grad_norm": 2.75, + "learning_rate": 0.019376203368763653, + "loss": 3.3412, + "mean_token_accuracy": 0.35095155239105225, + "num_tokens": 1377234020.0, + "step": 2694 + }, + { + "epoch": 0.7287723093564089, + "grad_norm": 3.40625, + "learning_rate": 0.01937562654358542, + "loss": 3.4756, + "mean_token_accuracy": 0.3462032675743103, + "num_tokens": 1377752082.0, + "step": 2695 + }, + { + "epoch": 0.7290427257977286, + "grad_norm": 2.25, + "learning_rate": 0.019375049461418332, + "loss": 3.4457, + "mean_token_accuracy": 0.36340054869651794, + "num_tokens": 1378276336.0, + "step": 2696 + }, + { + "epoch": 0.7293131422390481, + "grad_norm": 3.546875, + "learning_rate": 0.019374472122280096, + "loss": 3.158, + "mean_token_accuracy": 0.4235912561416626, + "num_tokens": 1378777220.0, + "step": 2697 + }, + { + "epoch": 0.7295835586803677, + "grad_norm": 2.328125, + "learning_rate": 0.01937389452618843, + "loss": 3.0892, + "mean_token_accuracy": 0.4257051944732666, + "num_tokens": 1379301495.0, + "step": 2698 + }, + { + "epoch": 0.7298539751216874, + "grad_norm": 3.0625, + "learning_rate": 0.019373316673161053, + "loss": 3.4406, + "mean_token_accuracy": 0.33674463629722595, + "num_tokens": 1379825770.0, + "step": 2699 + }, + { + "epoch": 0.730124391563007, + "grad_norm": 2.546875, + "learning_rate": 0.019372738563215697, + "loss": 3.0978, + "mean_token_accuracy": 0.36541733145713806, + "num_tokens": 1380349941.0, + "step": 2700 + }, + { + "epoch": 0.7303948080043267, + "grad_norm": 3.078125, + "learning_rate": 0.019372160196370097, + "loss": 3.4202, + "mean_token_accuracy": 0.37589091062545776, + "num_tokens": 1380874163.0, + "step": 2701 + }, + { + "epoch": 0.7306652244456463, + "grad_norm": 3.25, + "learning_rate": 0.019371581572642002, + "loss": 3.5563, + "mean_token_accuracy": 0.3345300257205963, + "num_tokens": 1381398340.0, + "step": 2702 + }, + { + "epoch": 0.7309356408869659, + "grad_norm": 3.84375, + "learning_rate": 0.019371002692049163, + "loss": 3.5074, + "mean_token_accuracy": 0.35462698340415955, + "num_tokens": 1381922491.0, + "step": 2703 + }, + { + "epoch": 0.7312060573282856, + "grad_norm": 2.71875, + "learning_rate": 0.019370423554609346, + "loss": 3.4652, + "mean_token_accuracy": 0.3428320288658142, + "num_tokens": 1382397265.0, + "step": 2704 + }, + { + "epoch": 0.7314764737696052, + "grad_norm": 3.0, + "learning_rate": 0.01936984416034032, + "loss": 3.6095, + "mean_token_accuracy": 0.36787474155426025, + "num_tokens": 1382921501.0, + "step": 2705 + }, + { + "epoch": 0.7317468902109249, + "grad_norm": 3.421875, + "learning_rate": 0.019369264509259864, + "loss": 3.2685, + "mean_token_accuracy": 0.36476513743400574, + "num_tokens": 1383400271.0, + "step": 2706 + }, + { + "epoch": 0.7320173066522444, + "grad_norm": 4.125, + "learning_rate": 0.019368684601385754, + "loss": 3.1915, + "mean_token_accuracy": 0.3725668787956238, + "num_tokens": 1383911871.0, + "step": 2707 + }, + { + "epoch": 0.732287723093564, + "grad_norm": 3.421875, + "learning_rate": 0.01936810443673579, + "loss": 3.3619, + "mean_token_accuracy": 0.364301860332489, + "num_tokens": 1384436099.0, + "step": 2708 + }, + { + "epoch": 0.7325581395348837, + "grad_norm": 3.171875, + "learning_rate": 0.019367524015327774, + "loss": 3.2649, + "mean_token_accuracy": 0.38467055559158325, + "num_tokens": 1384960362.0, + "step": 2709 + }, + { + "epoch": 0.7328285559762033, + "grad_norm": 3.609375, + "learning_rate": 0.019366943337179518, + "loss": 3.4325, + "mean_token_accuracy": 0.35121411085128784, + "num_tokens": 1385484626.0, + "step": 2710 + }, + { + "epoch": 0.733098972417523, + "grad_norm": 266.0, + "learning_rate": 0.019366362402308834, + "loss": 14.7858, + "mean_token_accuracy": 0.0, + "num_tokens": 1386008872.0, + "step": 2711 + }, + { + "epoch": 0.7333693888588426, + "grad_norm": 8.25, + "learning_rate": 0.019365781210733544, + "loss": 4.081, + "mean_token_accuracy": 0.28565794229507446, + "num_tokens": 1386533092.0, + "step": 2712 + }, + { + "epoch": 0.7336398053001623, + "grad_norm": 2.234375, + "learning_rate": 0.019365199762471485, + "loss": 3.3839, + "mean_token_accuracy": 0.3528260290622711, + "num_tokens": 1387055512.0, + "step": 2713 + }, + { + "epoch": 0.7339102217414819, + "grad_norm": 2.4375, + "learning_rate": 0.019364618057540497, + "loss": 3.5692, + "mean_token_accuracy": 0.35191065073013306, + "num_tokens": 1387579700.0, + "step": 2714 + }, + { + "epoch": 0.7341806381828015, + "grad_norm": 3.1875, + "learning_rate": 0.01936403609595843, + "loss": 3.3266, + "mean_token_accuracy": 0.37991583347320557, + "num_tokens": 1388060487.0, + "step": 2715 + }, + { + "epoch": 0.7344510546241212, + "grad_norm": 2.140625, + "learning_rate": 0.019363453877743138, + "loss": 3.4506, + "mean_token_accuracy": 0.36542683839797974, + "num_tokens": 1388584731.0, + "step": 2716 + }, + { + "epoch": 0.7347214710654408, + "grad_norm": 3.3125, + "learning_rate": 0.01936287140291248, + "loss": 3.5455, + "mean_token_accuracy": 0.34281405806541443, + "num_tokens": 1389109008.0, + "step": 2717 + }, + { + "epoch": 0.7349918875067604, + "grad_norm": 2.59375, + "learning_rate": 0.019362288671484342, + "loss": 3.5045, + "mean_token_accuracy": 0.36833488941192627, + "num_tokens": 1389633111.0, + "step": 2718 + }, + { + "epoch": 0.73526230394808, + "grad_norm": 2.828125, + "learning_rate": 0.019361705683476595, + "loss": 3.5317, + "mean_token_accuracy": 0.3608168959617615, + "num_tokens": 1390157332.0, + "step": 2719 + }, + { + "epoch": 0.7355327203893997, + "grad_norm": 2.96875, + "learning_rate": 0.019361122438907124, + "loss": 3.5051, + "mean_token_accuracy": 0.3794984221458435, + "num_tokens": 1390610249.0, + "step": 2720 + }, + { + "epoch": 0.7358031368307193, + "grad_norm": 3.671875, + "learning_rate": 0.01936053893779383, + "loss": 3.5416, + "mean_token_accuracy": 0.3498678505420685, + "num_tokens": 1391134520.0, + "step": 2721 + }, + { + "epoch": 0.7360735532720389, + "grad_norm": 3.203125, + "learning_rate": 0.01935995518015461, + "loss": 3.4455, + "mean_token_accuracy": 0.36019837856292725, + "num_tokens": 1391658789.0, + "step": 2722 + }, + { + "epoch": 0.7363439697133586, + "grad_norm": 3.34375, + "learning_rate": 0.019359371166007385, + "loss": 3.3842, + "mean_token_accuracy": 0.3562348783016205, + "num_tokens": 1392182973.0, + "step": 2723 + }, + { + "epoch": 0.7366143861546782, + "grad_norm": 2.203125, + "learning_rate": 0.019358786895370066, + "loss": 3.2226, + "mean_token_accuracy": 0.3695495128631592, + "num_tokens": 1392707239.0, + "step": 2724 + }, + { + "epoch": 0.7368848025959979, + "grad_norm": 2.65625, + "learning_rate": 0.01935820236826059, + "loss": 3.3504, + "mean_token_accuracy": 0.389130562543869, + "num_tokens": 1393222160.0, + "step": 2725 + }, + { + "epoch": 0.7371552190373175, + "grad_norm": 2.765625, + "learning_rate": 0.01935761758469688, + "loss": 3.4223, + "mean_token_accuracy": 0.37149810791015625, + "num_tokens": 1393746362.0, + "step": 2726 + }, + { + "epoch": 0.7374256354786372, + "grad_norm": 2.640625, + "learning_rate": 0.019357032544696885, + "loss": 3.3359, + "mean_token_accuracy": 0.39320844411849976, + "num_tokens": 1394270482.0, + "step": 2727 + }, + { + "epoch": 0.7376960519199567, + "grad_norm": 2.9375, + "learning_rate": 0.019356447248278556, + "loss": 3.5152, + "mean_token_accuracy": 0.3656961917877197, + "num_tokens": 1394794751.0, + "step": 2728 + }, + { + "epoch": 0.7379664683612763, + "grad_norm": 4.96875, + "learning_rate": 0.019355861695459848, + "loss": 3.1276, + "mean_token_accuracy": 0.4157163202762604, + "num_tokens": 1395318988.0, + "step": 2729 + }, + { + "epoch": 0.738236884802596, + "grad_norm": 2.234375, + "learning_rate": 0.019355275886258735, + "loss": 3.3044, + "mean_token_accuracy": 0.35570186376571655, + "num_tokens": 1395843242.0, + "step": 2730 + }, + { + "epoch": 0.7385073012439156, + "grad_norm": 189.0, + "learning_rate": 0.01935468982069318, + "loss": 11.5619, + "mean_token_accuracy": 0.0020755650475621223, + "num_tokens": 1396331611.0, + "step": 2731 + }, + { + "epoch": 0.7387777176852353, + "grad_norm": 7.9375, + "learning_rate": 0.01935410349878118, + "loss": 4.3544, + "mean_token_accuracy": 0.2403450310230255, + "num_tokens": 1396855768.0, + "step": 2732 + }, + { + "epoch": 0.7390481341265549, + "grad_norm": 4.4375, + "learning_rate": 0.01935351692054071, + "loss": 3.2957, + "mean_token_accuracy": 0.3731982111930847, + "num_tokens": 1397322807.0, + "step": 2733 + }, + { + "epoch": 0.7393185505678745, + "grad_norm": 3.203125, + "learning_rate": 0.01935293008598978, + "loss": 3.3351, + "mean_token_accuracy": 0.35859376192092896, + "num_tokens": 1397847022.0, + "step": 2734 + }, + { + "epoch": 0.7395889670091942, + "grad_norm": 2.40625, + "learning_rate": 0.019352342995146387, + "loss": 3.5108, + "mean_token_accuracy": 0.3702132999897003, + "num_tokens": 1398313825.0, + "step": 2735 + }, + { + "epoch": 0.7398593834505138, + "grad_norm": 2.609375, + "learning_rate": 0.019351755648028546, + "loss": 3.6531, + "mean_token_accuracy": 0.3533381223678589, + "num_tokens": 1398838093.0, + "step": 2736 + }, + { + "epoch": 0.7401297998918335, + "grad_norm": 3.046875, + "learning_rate": 0.019351168044654286, + "loss": 3.3321, + "mean_token_accuracy": 0.3803151249885559, + "num_tokens": 1399362288.0, + "step": 2737 + }, + { + "epoch": 0.740400216333153, + "grad_norm": 3.421875, + "learning_rate": 0.019350580185041627, + "loss": 3.4388, + "mean_token_accuracy": 0.3601878583431244, + "num_tokens": 1399854104.0, + "step": 2738 + }, + { + "epoch": 0.7406706327744726, + "grad_norm": 3.109375, + "learning_rate": 0.019349992069208614, + "loss": 3.3566, + "mean_token_accuracy": 0.3682897388935089, + "num_tokens": 1400378390.0, + "step": 2739 + }, + { + "epoch": 0.7409410492157923, + "grad_norm": 7.90625, + "learning_rate": 0.019349403697173286, + "loss": 3.5874, + "mean_token_accuracy": 0.35473427176475525, + "num_tokens": 1400902589.0, + "step": 2740 + }, + { + "epoch": 0.7412114656571119, + "grad_norm": 2.34375, + "learning_rate": 0.0193488150689537, + "loss": 3.414, + "mean_token_accuracy": 0.3645681142807007, + "num_tokens": 1401426816.0, + "step": 2741 + }, + { + "epoch": 0.7414818820984316, + "grad_norm": 4.96875, + "learning_rate": 0.019348226184567916, + "loss": 3.5715, + "mean_token_accuracy": 0.35438069701194763, + "num_tokens": 1401951092.0, + "step": 2742 + }, + { + "epoch": 0.7417522985397512, + "grad_norm": 2.703125, + "learning_rate": 0.019347637044034, + "loss": 3.3529, + "mean_token_accuracy": 0.3666905164718628, + "num_tokens": 1402475298.0, + "step": 2743 + }, + { + "epoch": 0.7420227149810709, + "grad_norm": 2.875, + "learning_rate": 0.019347047647370032, + "loss": 3.4324, + "mean_token_accuracy": 0.35854536294937134, + "num_tokens": 1402999549.0, + "step": 2744 + }, + { + "epoch": 0.7422931314223905, + "grad_norm": 3.453125, + "learning_rate": 0.019346457994594096, + "loss": 3.4462, + "mean_token_accuracy": 0.3689351975917816, + "num_tokens": 1403479027.0, + "step": 2745 + }, + { + "epoch": 0.7425635478637101, + "grad_norm": 2.828125, + "learning_rate": 0.01934586808572428, + "loss": 3.4592, + "mean_token_accuracy": 0.35928773880004883, + "num_tokens": 1404003187.0, + "step": 2746 + }, + { + "epoch": 0.7428339643050298, + "grad_norm": 3.34375, + "learning_rate": 0.019345277920778694, + "loss": 3.6812, + "mean_token_accuracy": 0.3461785316467285, + "num_tokens": 1404526391.0, + "step": 2747 + }, + { + "epoch": 0.7431043807463493, + "grad_norm": 3.390625, + "learning_rate": 0.019344687499775436, + "loss": 3.4913, + "mean_token_accuracy": 0.372854083776474, + "num_tokens": 1405050668.0, + "step": 2748 + }, + { + "epoch": 0.743374797187669, + "grad_norm": 3.96875, + "learning_rate": 0.019344096822732623, + "loss": 3.494, + "mean_token_accuracy": 0.36345046758651733, + "num_tokens": 1405574927.0, + "step": 2749 + }, + { + "epoch": 0.7436452136289886, + "grad_norm": 3.0, + "learning_rate": 0.01934350588966838, + "loss": 3.59, + "mean_token_accuracy": 0.35716402530670166, + "num_tokens": 1406099197.0, + "step": 2750 + }, + { + "epoch": 0.7439156300703083, + "grad_norm": 242.0, + "learning_rate": 0.019342914700600844, + "loss": 23.7429, + "mean_token_accuracy": 6.33488452876918e-05, + "num_tokens": 1406623369.0, + "step": 2751 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 6.09375, + "learning_rate": 0.01934232325554815, + "loss": 3.9335, + "mean_token_accuracy": 0.3312644958496094, + "num_tokens": 1407126510.0, + "step": 2752 + }, + { + "epoch": 0.7444564629529475, + "grad_norm": 2.484375, + "learning_rate": 0.019341731554528445, + "loss": 3.654, + "mean_token_accuracy": 0.35456734895706177, + "num_tokens": 1407650712.0, + "step": 2753 + }, + { + "epoch": 0.7447268793942672, + "grad_norm": 3.234375, + "learning_rate": 0.019341139597559884, + "loss": 3.425, + "mean_token_accuracy": 0.37082305550575256, + "num_tokens": 1408142502.0, + "step": 2754 + }, + { + "epoch": 0.7449972958355868, + "grad_norm": 2.453125, + "learning_rate": 0.01934054738466063, + "loss": 3.6186, + "mean_token_accuracy": 0.36209946870803833, + "num_tokens": 1408617286.0, + "step": 2755 + }, + { + "epoch": 0.7452677122769065, + "grad_norm": 2.75, + "learning_rate": 0.01933995491584885, + "loss": 3.3662, + "mean_token_accuracy": 0.3736324906349182, + "num_tokens": 1409141498.0, + "step": 2756 + }, + { + "epoch": 0.7455381287182261, + "grad_norm": 3.953125, + "learning_rate": 0.019339362191142736, + "loss": 3.5998, + "mean_token_accuracy": 0.3472417891025543, + "num_tokens": 1409640455.0, + "step": 2757 + }, + { + "epoch": 0.7458085451595458, + "grad_norm": 2.734375, + "learning_rate": 0.01933876921056046, + "loss": 3.2674, + "mean_token_accuracy": 0.38000571727752686, + "num_tokens": 1410164664.0, + "step": 2758 + }, + { + "epoch": 0.7460789616008653, + "grad_norm": 3.109375, + "learning_rate": 0.019338175974120225, + "loss": 3.4256, + "mean_token_accuracy": 0.3523056209087372, + "num_tokens": 1410642963.0, + "step": 2759 + }, + { + "epoch": 0.7463493780421849, + "grad_norm": 2.46875, + "learning_rate": 0.019337582481840226, + "loss": 3.3353, + "mean_token_accuracy": 0.39992356300354004, + "num_tokens": 1411088087.0, + "step": 2760 + }, + { + "epoch": 0.7466197944835046, + "grad_norm": 3.765625, + "learning_rate": 0.019336988733738677, + "loss": 3.5985, + "mean_token_accuracy": 0.3406347930431366, + "num_tokens": 1411612331.0, + "step": 2761 + }, + { + "epoch": 0.7468902109248242, + "grad_norm": 2.484375, + "learning_rate": 0.0193363947298338, + "loss": 3.2788, + "mean_token_accuracy": 0.3649566173553467, + "num_tokens": 1412136437.0, + "step": 2762 + }, + { + "epoch": 0.7471606273661439, + "grad_norm": 2.484375, + "learning_rate": 0.019335800470143816, + "loss": 3.3563, + "mean_token_accuracy": 0.3769698143005371, + "num_tokens": 1412660567.0, + "step": 2763 + }, + { + "epoch": 0.7474310438074635, + "grad_norm": 3.484375, + "learning_rate": 0.019335205954686956, + "loss": 3.4423, + "mean_token_accuracy": 0.3777792453765869, + "num_tokens": 1413141324.0, + "step": 2764 + }, + { + "epoch": 0.7477014602487831, + "grad_norm": 3.109375, + "learning_rate": 0.01933461118348147, + "loss": 3.1222, + "mean_token_accuracy": 0.42314696311950684, + "num_tokens": 1413665511.0, + "step": 2765 + }, + { + "epoch": 0.7479718766901028, + "grad_norm": 2.859375, + "learning_rate": 0.0193340161565456, + "loss": 3.1101, + "mean_token_accuracy": 0.3852894902229309, + "num_tokens": 1414173358.0, + "step": 2766 + }, + { + "epoch": 0.7482422931314224, + "grad_norm": 2.765625, + "learning_rate": 0.019333420873897604, + "loss": 3.4003, + "mean_token_accuracy": 0.38062000274658203, + "num_tokens": 1414601455.0, + "step": 2767 + }, + { + "epoch": 0.7485127095727421, + "grad_norm": 2.34375, + "learning_rate": 0.01933282533555575, + "loss": 3.3921, + "mean_token_accuracy": 0.37871259450912476, + "num_tokens": 1415124094.0, + "step": 2768 + }, + { + "epoch": 0.7487831260140616, + "grad_norm": 3.03125, + "learning_rate": 0.01933222954153831, + "loss": 3.4236, + "mean_token_accuracy": 0.3936101794242859, + "num_tokens": 1415614909.0, + "step": 2769 + }, + { + "epoch": 0.7490535424553812, + "grad_norm": 3.484375, + "learning_rate": 0.019331633491863568, + "loss": 3.6492, + "mean_token_accuracy": 0.3562466502189636, + "num_tokens": 1416105873.0, + "step": 2770 + }, + { + "epoch": 0.7493239588967009, + "grad_norm": 2.296875, + "learning_rate": 0.019331037186549802, + "loss": 11.6223, + "mean_token_accuracy": 2.6236252779199276e-06, + "num_tokens": 1416630022.0, + "step": 2771 + }, + { + "epoch": 0.7495943753380205, + "grad_norm": 9.9375, + "learning_rate": 0.019330440625615317, + "loss": 4.164, + "mean_token_accuracy": 0.27194279432296753, + "num_tokens": 1417115931.0, + "step": 2772 + }, + { + "epoch": 0.7498647917793402, + "grad_norm": 2.515625, + "learning_rate": 0.01932984380907842, + "loss": 3.3682, + "mean_token_accuracy": 0.36045700311660767, + "num_tokens": 1417640159.0, + "step": 2773 + }, + { + "epoch": 0.7501352082206598, + "grad_norm": 4.5625, + "learning_rate": 0.019329246736957413, + "loss": 3.536, + "mean_token_accuracy": 0.3550686240196228, + "num_tokens": 1418164341.0, + "step": 2774 + }, + { + "epoch": 0.7504056246619795, + "grad_norm": 4.0, + "learning_rate": 0.01932864940927062, + "loss": 3.5612, + "mean_token_accuracy": 0.3381536304950714, + "num_tokens": 1418688620.0, + "step": 2775 + }, + { + "epoch": 0.7506760411032991, + "grad_norm": 3.328125, + "learning_rate": 0.019328051826036377, + "loss": 3.2992, + "mean_token_accuracy": 0.3558425307273865, + "num_tokens": 1419212841.0, + "step": 2776 + }, + { + "epoch": 0.7509464575446188, + "grad_norm": 3.0625, + "learning_rate": 0.019327453987273008, + "loss": 3.7734, + "mean_token_accuracy": 0.3372073471546173, + "num_tokens": 1419737094.0, + "step": 2777 + }, + { + "epoch": 0.7512168739859384, + "grad_norm": 3.1875, + "learning_rate": 0.019326855892998866, + "loss": 3.333, + "mean_token_accuracy": 0.34651944041252136, + "num_tokens": 1420261215.0, + "step": 2778 + }, + { + "epoch": 0.7514872904272579, + "grad_norm": 3.328125, + "learning_rate": 0.019326257543232292, + "loss": 3.4589, + "mean_token_accuracy": 0.3656734228134155, + "num_tokens": 1420770820.0, + "step": 2779 + }, + { + "epoch": 0.7517577068685776, + "grad_norm": 3.625, + "learning_rate": 0.01932565893799165, + "loss": 3.49, + "mean_token_accuracy": 0.3718626797199249, + "num_tokens": 1421242622.0, + "step": 2780 + }, + { + "epoch": 0.7520281233098972, + "grad_norm": 2.15625, + "learning_rate": 0.01932506007729531, + "loss": 3.2812, + "mean_token_accuracy": 0.3405919075012207, + "num_tokens": 1421766830.0, + "step": 2781 + }, + { + "epoch": 0.7522985397512169, + "grad_norm": 2.421875, + "learning_rate": 0.019324460961161645, + "loss": 3.5424, + "mean_token_accuracy": 0.36854031682014465, + "num_tokens": 1422291006.0, + "step": 2782 + }, + { + "epoch": 0.7525689561925365, + "grad_norm": 4.21875, + "learning_rate": 0.019323861589609036, + "loss": 3.3939, + "mean_token_accuracy": 0.3758124113082886, + "num_tokens": 1422815192.0, + "step": 2783 + }, + { + "epoch": 0.7528393726338561, + "grad_norm": 3.046875, + "learning_rate": 0.019323261962655876, + "loss": 3.2402, + "mean_token_accuracy": 0.39036449790000916, + "num_tokens": 1423297981.0, + "step": 2784 + }, + { + "epoch": 0.7531097890751758, + "grad_norm": 2.90625, + "learning_rate": 0.01932266208032056, + "loss": 3.5124, + "mean_token_accuracy": 0.35116809606552124, + "num_tokens": 1423822179.0, + "step": 2785 + }, + { + "epoch": 0.7533802055164954, + "grad_norm": 2.734375, + "learning_rate": 0.0193220619426215, + "loss": 3.4354, + "mean_token_accuracy": 0.3659265339374542, + "num_tokens": 1424346367.0, + "step": 2786 + }, + { + "epoch": 0.7536506219578151, + "grad_norm": 3.3125, + "learning_rate": 0.0193214615495771, + "loss": 3.5686, + "mean_token_accuracy": 0.3701040744781494, + "num_tokens": 1424870635.0, + "step": 2787 + }, + { + "epoch": 0.7539210383991347, + "grad_norm": 2.90625, + "learning_rate": 0.019320860901205793, + "loss": 3.418, + "mean_token_accuracy": 0.3765174150466919, + "num_tokens": 1425335547.0, + "step": 2788 + }, + { + "epoch": 0.7541914548404542, + "grad_norm": 4.03125, + "learning_rate": 0.019320259997526, + "loss": 3.1001, + "mean_token_accuracy": 0.3885747194290161, + "num_tokens": 1425807859.0, + "step": 2789 + }, + { + "epoch": 0.7544618712817739, + "grad_norm": 3.359375, + "learning_rate": 0.019319658838556162, + "loss": 3.3996, + "mean_token_accuracy": 0.373016357421875, + "num_tokens": 1426283537.0, + "step": 2790 + }, + { + "epoch": 0.7547322877230935, + "grad_norm": 49.0, + "learning_rate": 0.019319057424314726, + "loss": 14.8261, + "mean_token_accuracy": 0.009522615000605583, + "num_tokens": 1426807772.0, + "step": 2791 + }, + { + "epoch": 0.7550027041644132, + "grad_norm": 9.0, + "learning_rate": 0.019318455754820142, + "loss": 4.0561, + "mean_token_accuracy": 0.30771341919898987, + "num_tokens": 1427293991.0, + "step": 2792 + }, + { + "epoch": 0.7552731206057328, + "grad_norm": 2.578125, + "learning_rate": 0.019317853830090877, + "loss": 3.3968, + "mean_token_accuracy": 0.3709918260574341, + "num_tokens": 1427795470.0, + "step": 2793 + }, + { + "epoch": 0.7555435370470525, + "grad_norm": 3.578125, + "learning_rate": 0.01931725165014539, + "loss": 3.6444, + "mean_token_accuracy": 0.3571189045906067, + "num_tokens": 1428293742.0, + "step": 2794 + }, + { + "epoch": 0.7558139534883721, + "grad_norm": 3.625, + "learning_rate": 0.019316649215002164, + "loss": 3.6085, + "mean_token_accuracy": 0.36559903621673584, + "num_tokens": 1428818015.0, + "step": 2795 + }, + { + "epoch": 0.7560843699296917, + "grad_norm": 2.359375, + "learning_rate": 0.019316046524679685, + "loss": 3.2557, + "mean_token_accuracy": 0.36961162090301514, + "num_tokens": 1429342176.0, + "step": 2796 + }, + { + "epoch": 0.7563547863710114, + "grad_norm": 3.984375, + "learning_rate": 0.01931544357919644, + "loss": 3.3448, + "mean_token_accuracy": 0.3897242248058319, + "num_tokens": 1429822587.0, + "step": 2797 + }, + { + "epoch": 0.756625202812331, + "grad_norm": 2.015625, + "learning_rate": 0.01931484037857093, + "loss": 3.3104, + "mean_token_accuracy": 0.36988985538482666, + "num_tokens": 1430346603.0, + "step": 2798 + }, + { + "epoch": 0.7568956192536507, + "grad_norm": 3.25, + "learning_rate": 0.019314236922821672, + "loss": 3.4449, + "mean_token_accuracy": 0.3690339922904968, + "num_tokens": 1430870684.0, + "step": 2799 + }, + { + "epoch": 0.7571660356949702, + "grad_norm": 3.984375, + "learning_rate": 0.019313633211967166, + "loss": 3.6411, + "mean_token_accuracy": 0.3702065944671631, + "num_tokens": 1431343657.0, + "step": 2800 + }, + { + "epoch": 0.7574364521362899, + "grad_norm": 3.09375, + "learning_rate": 0.019313029246025952, + "loss": 3.5157, + "mean_token_accuracy": 0.35115015506744385, + "num_tokens": 1431867845.0, + "step": 2801 + }, + { + "epoch": 0.7577068685776095, + "grad_norm": 8.5625, + "learning_rate": 0.01931242502501655, + "loss": 3.4811, + "mean_token_accuracy": 0.36307841539382935, + "num_tokens": 1432392104.0, + "step": 2802 + }, + { + "epoch": 0.7579772850189291, + "grad_norm": 2.203125, + "learning_rate": 0.0193118205489575, + "loss": 3.6324, + "mean_token_accuracy": 0.34858477115631104, + "num_tokens": 1432916353.0, + "step": 2803 + }, + { + "epoch": 0.7582477014602488, + "grad_norm": 3.484375, + "learning_rate": 0.019311215817867357, + "loss": 3.249, + "mean_token_accuracy": 0.3644852042198181, + "num_tokens": 1433440516.0, + "step": 2804 + }, + { + "epoch": 0.7585181179015684, + "grad_norm": 2.890625, + "learning_rate": 0.01931061083176467, + "loss": 3.4338, + "mean_token_accuracy": 0.3508839011192322, + "num_tokens": 1433931021.0, + "step": 2805 + }, + { + "epoch": 0.7587885343428881, + "grad_norm": 2.765625, + "learning_rate": 0.019310005590668002, + "loss": 3.2421, + "mean_token_accuracy": 0.3676157295703888, + "num_tokens": 1434455199.0, + "step": 2806 + }, + { + "epoch": 0.7590589507842077, + "grad_norm": 3.484375, + "learning_rate": 0.019309400094595922, + "loss": 3.4593, + "mean_token_accuracy": 0.40552231669425964, + "num_tokens": 1434892377.0, + "step": 2807 + }, + { + "epoch": 0.7593293672255274, + "grad_norm": 2.515625, + "learning_rate": 0.019308794343567016, + "loss": 3.2431, + "mean_token_accuracy": 0.39203110337257385, + "num_tokens": 1435361510.0, + "step": 2808 + }, + { + "epoch": 0.759599783666847, + "grad_norm": 3.75, + "learning_rate": 0.019308188337599865, + "loss": 3.631, + "mean_token_accuracy": 0.35954350233078003, + "num_tokens": 1435885746.0, + "step": 2809 + }, + { + "epoch": 0.7598702001081665, + "grad_norm": 3.203125, + "learning_rate": 0.019307582076713056, + "loss": 3.5021, + "mean_token_accuracy": 0.36610060930252075, + "num_tokens": 1436410025.0, + "step": 2810 + }, + { + "epoch": 0.7601406165494862, + "grad_norm": 256.0, + "learning_rate": 0.019306975560925203, + "loss": 14.0206, + "mean_token_accuracy": 0.00018735809135250747, + "num_tokens": 1436913854.0, + "step": 2811 + }, + { + "epoch": 0.7604110329908058, + "grad_norm": 6.90625, + "learning_rate": 0.019306368790254907, + "loss": 4.1442, + "mean_token_accuracy": 0.31658267974853516, + "num_tokens": 1437329725.0, + "step": 2812 + }, + { + "epoch": 0.7606814494321255, + "grad_norm": 2.78125, + "learning_rate": 0.019305761764720795, + "loss": 3.7468, + "mean_token_accuracy": 0.3384854197502136, + "num_tokens": 1437853917.0, + "step": 2813 + }, + { + "epoch": 0.7609518658734451, + "grad_norm": 4.0625, + "learning_rate": 0.01930515448434148, + "loss": 3.6921, + "mean_token_accuracy": 0.3435071110725403, + "num_tokens": 1438350878.0, + "step": 2814 + }, + { + "epoch": 0.7612222823147647, + "grad_norm": 3.265625, + "learning_rate": 0.01930454694913561, + "loss": 3.5754, + "mean_token_accuracy": 0.35883957147598267, + "num_tokens": 1438875073.0, + "step": 2815 + }, + { + "epoch": 0.7614926987560844, + "grad_norm": 2.875, + "learning_rate": 0.019303939159121812, + "loss": 3.4719, + "mean_token_accuracy": 0.3607199192047119, + "num_tokens": 1439399225.0, + "step": 2816 + }, + { + "epoch": 0.761763115197404, + "grad_norm": 2.609375, + "learning_rate": 0.01930333111431874, + "loss": 3.3875, + "mean_token_accuracy": 0.37766844034194946, + "num_tokens": 1439867994.0, + "step": 2817 + }, + { + "epoch": 0.7620335316387237, + "grad_norm": 3.03125, + "learning_rate": 0.019302722814745047, + "loss": 3.6027, + "mean_token_accuracy": 0.3522224426269531, + "num_tokens": 1440392078.0, + "step": 2818 + }, + { + "epoch": 0.7623039480800433, + "grad_norm": 2.890625, + "learning_rate": 0.019302114260419407, + "loss": 3.5332, + "mean_token_accuracy": 0.374500572681427, + "num_tokens": 1440916269.0, + "step": 2819 + }, + { + "epoch": 0.7625743645213628, + "grad_norm": 2.671875, + "learning_rate": 0.019301505451360484, + "loss": 3.2186, + "mean_token_accuracy": 0.37150850892066956, + "num_tokens": 1441440466.0, + "step": 2820 + }, + { + "epoch": 0.7628447809626825, + "grad_norm": 2.90625, + "learning_rate": 0.019300896387586964, + "loss": 3.1809, + "mean_token_accuracy": 0.3784794807434082, + "num_tokens": 1441964665.0, + "step": 2821 + }, + { + "epoch": 0.7631151974040021, + "grad_norm": 2.765625, + "learning_rate": 0.019300287069117527, + "loss": 3.2314, + "mean_token_accuracy": 0.3731212317943573, + "num_tokens": 1442464583.0, + "step": 2822 + }, + { + "epoch": 0.7633856138453218, + "grad_norm": 3.21875, + "learning_rate": 0.019299677495970877, + "loss": 3.5279, + "mean_token_accuracy": 0.3677212595939636, + "num_tokens": 1442988852.0, + "step": 2823 + }, + { + "epoch": 0.7636560302866414, + "grad_norm": 2.96875, + "learning_rate": 0.01929906766816571, + "loss": 3.3768, + "mean_token_accuracy": 0.3800543248653412, + "num_tokens": 1443513079.0, + "step": 2824 + }, + { + "epoch": 0.7639264467279611, + "grad_norm": 2.859375, + "learning_rate": 0.019298457585720744, + "loss": 3.4749, + "mean_token_accuracy": 0.34424400329589844, + "num_tokens": 1444037119.0, + "step": 2825 + }, + { + "epoch": 0.7641968631692807, + "grad_norm": 2.875, + "learning_rate": 0.019297847248654694, + "loss": 3.4459, + "mean_token_accuracy": 0.3669757544994354, + "num_tokens": 1444553315.0, + "step": 2826 + }, + { + "epoch": 0.7644672796106003, + "grad_norm": 3.0, + "learning_rate": 0.019297236656986287, + "loss": 3.4161, + "mean_token_accuracy": 0.35803887248039246, + "num_tokens": 1445077593.0, + "step": 2827 + }, + { + "epoch": 0.76473769605192, + "grad_norm": 3.15625, + "learning_rate": 0.019296625810734262, + "loss": 3.1514, + "mean_token_accuracy": 0.3967134654521942, + "num_tokens": 1445601767.0, + "step": 2828 + }, + { + "epoch": 0.7650081124932396, + "grad_norm": 2.921875, + "learning_rate": 0.019296014709917358, + "loss": 3.4254, + "mean_token_accuracy": 0.36374396085739136, + "num_tokens": 1446126022.0, + "step": 2829 + }, + { + "epoch": 0.7652785289345592, + "grad_norm": 2.9375, + "learning_rate": 0.019295403354554323, + "loss": 3.4511, + "mean_token_accuracy": 0.3646872937679291, + "num_tokens": 1446650219.0, + "step": 2830 + }, + { + "epoch": 0.7655489453758788, + "grad_norm": 328.0, + "learning_rate": 0.019294791744663916, + "loss": 30.638, + "mean_token_accuracy": 8.29130076454021e-05, + "num_tokens": 1447174498.0, + "step": 2831 + }, + { + "epoch": 0.7658193618171985, + "grad_norm": 8.6875, + "learning_rate": 0.01929417988026491, + "loss": 4.2084, + "mean_token_accuracy": 0.29977887868881226, + "num_tokens": 1447698717.0, + "step": 2832 + }, + { + "epoch": 0.7660897782585181, + "grad_norm": 2.9375, + "learning_rate": 0.01929356776137607, + "loss": 3.5105, + "mean_token_accuracy": 0.32613763213157654, + "num_tokens": 1448222861.0, + "step": 2833 + }, + { + "epoch": 0.7663601946998377, + "grad_norm": 2.96875, + "learning_rate": 0.019292955388016186, + "loss": 3.7729, + "mean_token_accuracy": 0.3529950976371765, + "num_tokens": 1448690867.0, + "step": 2834 + }, + { + "epoch": 0.7666306111411574, + "grad_norm": 2.734375, + "learning_rate": 0.01929234276020404, + "loss": 3.4912, + "mean_token_accuracy": 0.35236769914627075, + "num_tokens": 1449214763.0, + "step": 2835 + }, + { + "epoch": 0.766901027582477, + "grad_norm": 2.703125, + "learning_rate": 0.019291729877958432, + "loss": 3.602, + "mean_token_accuracy": 0.35303640365600586, + "num_tokens": 1449738912.0, + "step": 2836 + }, + { + "epoch": 0.7671714440237967, + "grad_norm": 2.859375, + "learning_rate": 0.019291116741298166, + "loss": 3.4071, + "mean_token_accuracy": 0.37949874997138977, + "num_tokens": 1450262981.0, + "step": 2837 + }, + { + "epoch": 0.7674418604651163, + "grad_norm": 3.109375, + "learning_rate": 0.019290503350242055, + "loss": 3.4512, + "mean_token_accuracy": 0.36476564407348633, + "num_tokens": 1450787095.0, + "step": 2838 + }, + { + "epoch": 0.767712276906436, + "grad_norm": 3.015625, + "learning_rate": 0.01928988970480892, + "loss": 3.6311, + "mean_token_accuracy": 0.3712064027786255, + "num_tokens": 1451259932.0, + "step": 2839 + }, + { + "epoch": 0.7679826933477556, + "grad_norm": 3.640625, + "learning_rate": 0.019289275805017592, + "loss": 3.4367, + "mean_token_accuracy": 0.36749467253685, + "num_tokens": 1451732810.0, + "step": 2840 + }, + { + "epoch": 0.7682531097890751, + "grad_norm": 2.890625, + "learning_rate": 0.019288661650886902, + "loss": 3.4221, + "mean_token_accuracy": 0.36421769857406616, + "num_tokens": 1452257029.0, + "step": 2841 + }, + { + "epoch": 0.7685235262303948, + "grad_norm": 2.984375, + "learning_rate": 0.019288047242435706, + "loss": 3.2818, + "mean_token_accuracy": 0.3712800145149231, + "num_tokens": 1452781280.0, + "step": 2842 + }, + { + "epoch": 0.7687939426717144, + "grad_norm": 2.328125, + "learning_rate": 0.019287432579682837, + "loss": 3.3115, + "mean_token_accuracy": 0.38477689027786255, + "num_tokens": 1453305429.0, + "step": 2843 + }, + { + "epoch": 0.7690643591130341, + "grad_norm": 3.015625, + "learning_rate": 0.019286817662647168, + "loss": 3.5351, + "mean_token_accuracy": 0.34209322929382324, + "num_tokens": 1453829688.0, + "step": 2844 + }, + { + "epoch": 0.7693347755543537, + "grad_norm": 2.984375, + "learning_rate": 0.019286202491347566, + "loss": 3.2745, + "mean_token_accuracy": 0.3771905303001404, + "num_tokens": 1454353796.0, + "step": 2845 + }, + { + "epoch": 0.7696051919956733, + "grad_norm": 4.1875, + "learning_rate": 0.019285587065802898, + "loss": 3.6981, + "mean_token_accuracy": 0.3416188061237335, + "num_tokens": 1454878075.0, + "step": 2846 + }, + { + "epoch": 0.769875608436993, + "grad_norm": 4.03125, + "learning_rate": 0.01928497138603206, + "loss": 3.5914, + "mean_token_accuracy": 0.3670003116130829, + "num_tokens": 1455402241.0, + "step": 2847 + }, + { + "epoch": 0.7701460248783126, + "grad_norm": 2.421875, + "learning_rate": 0.01928435545205393, + "loss": 3.4057, + "mean_token_accuracy": 0.35964235663414, + "num_tokens": 1455926440.0, + "step": 2848 + }, + { + "epoch": 0.7704164413196323, + "grad_norm": 2.65625, + "learning_rate": 0.019283739263887413, + "loss": 3.4528, + "mean_token_accuracy": 0.34866392612457275, + "num_tokens": 1456450719.0, + "step": 2849 + }, + { + "epoch": 0.7706868577609519, + "grad_norm": 2.65625, + "learning_rate": 0.019283122821551413, + "loss": 3.4675, + "mean_token_accuracy": 0.35522204637527466, + "num_tokens": 1456966243.0, + "step": 2850 + }, + { + "epoch": 0.7709572742022714, + "grad_norm": 0.9375, + "learning_rate": 0.01928250612506485, + "loss": 11.1905, + "mean_token_accuracy": 1.4699618986924179e-05, + "num_tokens": 1457471871.0, + "step": 2851 + }, + { + "epoch": 0.7712276906435911, + "grad_norm": 7.375, + "learning_rate": 0.019281889174446638, + "loss": 4.1128, + "mean_token_accuracy": 0.29369279742240906, + "num_tokens": 1457995991.0, + "step": 2852 + }, + { + "epoch": 0.7714981070849107, + "grad_norm": 2.140625, + "learning_rate": 0.01928127196971571, + "loss": 3.5248, + "mean_token_accuracy": 0.3450625538825989, + "num_tokens": 1458520194.0, + "step": 2853 + }, + { + "epoch": 0.7717685235262304, + "grad_norm": 3.015625, + "learning_rate": 0.019280654510891004, + "loss": 3.3915, + "mean_token_accuracy": 0.3532676100730896, + "num_tokens": 1459044402.0, + "step": 2854 + }, + { + "epoch": 0.77203893996755, + "grad_norm": 3.25, + "learning_rate": 0.01928003679799147, + "loss": 3.4943, + "mean_token_accuracy": 0.3703708350658417, + "num_tokens": 1459563928.0, + "step": 2855 + }, + { + "epoch": 0.7723093564088697, + "grad_norm": 3.921875, + "learning_rate": 0.019279418831036055, + "loss": 3.5145, + "mean_token_accuracy": 0.3525775074958801, + "num_tokens": 1460088008.0, + "step": 2856 + }, + { + "epoch": 0.7725797728501893, + "grad_norm": 2.796875, + "learning_rate": 0.01927880061004372, + "loss": 3.5692, + "mean_token_accuracy": 0.34908515214920044, + "num_tokens": 1460612199.0, + "step": 2857 + }, + { + "epoch": 0.772850189291509, + "grad_norm": 3.625, + "learning_rate": 0.01927818213503344, + "loss": 3.5043, + "mean_token_accuracy": 0.3638814687728882, + "num_tokens": 1461124945.0, + "step": 2858 + }, + { + "epoch": 0.7731206057328286, + "grad_norm": 3.25, + "learning_rate": 0.019277563406024184, + "loss": 3.4249, + "mean_token_accuracy": 0.37773364782333374, + "num_tokens": 1461595439.0, + "step": 2859 + }, + { + "epoch": 0.7733910221741482, + "grad_norm": 3.25, + "learning_rate": 0.01927694442303494, + "loss": 3.3994, + "mean_token_accuracy": 0.377139151096344, + "num_tokens": 1462036282.0, + "step": 2860 + }, + { + "epoch": 0.7736614386154678, + "grad_norm": 3.484375, + "learning_rate": 0.019276325186084704, + "loss": 3.5257, + "mean_token_accuracy": 0.3534140884876251, + "num_tokens": 1462560539.0, + "step": 2861 + }, + { + "epoch": 0.7739318550567874, + "grad_norm": 3.46875, + "learning_rate": 0.01927570569519247, + "loss": 3.2641, + "mean_token_accuracy": 0.37746691703796387, + "num_tokens": 1463065022.0, + "step": 2862 + }, + { + "epoch": 0.774202271498107, + "grad_norm": 4.1875, + "learning_rate": 0.01927508595037725, + "loss": 3.5025, + "mean_token_accuracy": 0.352347195148468, + "num_tokens": 1463589203.0, + "step": 2863 + }, + { + "epoch": 0.7744726879394267, + "grad_norm": 2.625, + "learning_rate": 0.019274465951658052, + "loss": 3.515, + "mean_token_accuracy": 0.3605661988258362, + "num_tokens": 1464113396.0, + "step": 2864 + }, + { + "epoch": 0.7747431043807463, + "grad_norm": 3.921875, + "learning_rate": 0.01927384569905391, + "loss": 3.7185, + "mean_token_accuracy": 0.349966824054718, + "num_tokens": 1464637652.0, + "step": 2865 + }, + { + "epoch": 0.775013520822066, + "grad_norm": 3.3125, + "learning_rate": 0.01927322519258385, + "loss": 3.3722, + "mean_token_accuracy": 0.37543952465057373, + "num_tokens": 1465161864.0, + "step": 2866 + }, + { + "epoch": 0.7752839372633856, + "grad_norm": 3.40625, + "learning_rate": 0.01927260443226691, + "loss": 3.6027, + "mean_token_accuracy": 0.37144261598587036, + "num_tokens": 1465686057.0, + "step": 2867 + }, + { + "epoch": 0.7755543537047053, + "grad_norm": 2.796875, + "learning_rate": 0.019271983418122137, + "loss": 3.247, + "mean_token_accuracy": 0.37491607666015625, + "num_tokens": 1466157730.0, + "step": 2868 + }, + { + "epoch": 0.7758247701460249, + "grad_norm": 37.25, + "learning_rate": 0.019271362150168586, + "loss": 3.6049, + "mean_token_accuracy": 0.356129914522171, + "num_tokens": 1466681946.0, + "step": 2869 + }, + { + "epoch": 0.7760951865873446, + "grad_norm": 5.125, + "learning_rate": 0.01927074062842532, + "loss": 3.7126, + "mean_token_accuracy": 0.34110528230667114, + "num_tokens": 1467136493.0, + "step": 2870 + }, + { + "epoch": 0.7763656030286642, + "grad_norm": 4.875, + "learning_rate": 0.01927011885291141, + "loss": 10.1863, + "mean_token_accuracy": 0.0014655153499916196, + "num_tokens": 1467605446.0, + "step": 2871 + }, + { + "epoch": 0.7766360194699837, + "grad_norm": 8.75, + "learning_rate": 0.01926949682364593, + "loss": 4.0681, + "mean_token_accuracy": 0.31205591559410095, + "num_tokens": 1468129719.0, + "step": 2872 + }, + { + "epoch": 0.7769064359113034, + "grad_norm": 3.046875, + "learning_rate": 0.019268874540647973, + "loss": 3.6046, + "mean_token_accuracy": 0.3573121428489685, + "num_tokens": 1468653959.0, + "step": 2873 + }, + { + "epoch": 0.777176852352623, + "grad_norm": 3.34375, + "learning_rate": 0.01926825200393663, + "loss": 3.5535, + "mean_token_accuracy": 0.3554481863975525, + "num_tokens": 1469178214.0, + "step": 2874 + }, + { + "epoch": 0.7774472687939427, + "grad_norm": 3.65625, + "learning_rate": 0.019267629213530993, + "loss": 3.6593, + "mean_token_accuracy": 0.35182297229766846, + "num_tokens": 1469702366.0, + "step": 2875 + }, + { + "epoch": 0.7777176852352623, + "grad_norm": 2.609375, + "learning_rate": 0.01926700616945018, + "loss": 3.4988, + "mean_token_accuracy": 0.3637814521789551, + "num_tokens": 1470226614.0, + "step": 2876 + }, + { + "epoch": 0.777988101676582, + "grad_norm": 3.546875, + "learning_rate": 0.019266382871713308, + "loss": 3.4344, + "mean_token_accuracy": 0.363430380821228, + "num_tokens": 1470750874.0, + "step": 2877 + }, + { + "epoch": 0.7782585181179016, + "grad_norm": 10.375, + "learning_rate": 0.0192657593203395, + "loss": 3.3324, + "mean_token_accuracy": 0.38083845376968384, + "num_tokens": 1471275076.0, + "step": 2878 + }, + { + "epoch": 0.7785289345592212, + "grad_norm": 2.25, + "learning_rate": 0.019265135515347885, + "loss": 3.4347, + "mean_token_accuracy": 0.3602942228317261, + "num_tokens": 1471799142.0, + "step": 2879 + }, + { + "epoch": 0.7787993510005409, + "grad_norm": 3.28125, + "learning_rate": 0.01926451145675761, + "loss": 3.4307, + "mean_token_accuracy": 0.34459173679351807, + "num_tokens": 1472323421.0, + "step": 2880 + }, + { + "epoch": 0.7790697674418605, + "grad_norm": 2.953125, + "learning_rate": 0.019263887144587814, + "loss": 3.5312, + "mean_token_accuracy": 0.3574758470058441, + "num_tokens": 1472847555.0, + "step": 2881 + }, + { + "epoch": 0.77934018388318, + "grad_norm": 3.953125, + "learning_rate": 0.01926326257885766, + "loss": 3.5903, + "mean_token_accuracy": 0.3422773480415344, + "num_tokens": 1473371824.0, + "step": 2882 + }, + { + "epoch": 0.7796106003244997, + "grad_norm": 2.6875, + "learning_rate": 0.019262637759586313, + "loss": 3.5026, + "mean_token_accuracy": 0.36360201239585876, + "num_tokens": 1473896076.0, + "step": 2883 + }, + { + "epoch": 0.7798810167658193, + "grad_norm": 3.703125, + "learning_rate": 0.019262012686792937, + "loss": 3.5493, + "mean_token_accuracy": 0.36069783568382263, + "num_tokens": 1474389676.0, + "step": 2884 + }, + { + "epoch": 0.780151433207139, + "grad_norm": 2.5625, + "learning_rate": 0.019261387360496717, + "loss": 3.4515, + "mean_token_accuracy": 0.34947916865348816, + "num_tokens": 1474913928.0, + "step": 2885 + }, + { + "epoch": 0.7804218496484586, + "grad_norm": 2.765625, + "learning_rate": 0.019260761780716836, + "loss": 3.4622, + "mean_token_accuracy": 0.35679423809051514, + "num_tokens": 1475438206.0, + "step": 2886 + }, + { + "epoch": 0.7806922660897783, + "grad_norm": 3.546875, + "learning_rate": 0.019260135947472494, + "loss": 3.5719, + "mean_token_accuracy": 0.36449190974235535, + "num_tokens": 1475962370.0, + "step": 2887 + }, + { + "epoch": 0.7809626825310979, + "grad_norm": 4.78125, + "learning_rate": 0.019259509860782884, + "loss": 3.3209, + "mean_token_accuracy": 0.4020686745643616, + "num_tokens": 1476421215.0, + "step": 2888 + }, + { + "epoch": 0.7812330989724176, + "grad_norm": 1.8828125, + "learning_rate": 0.019258883520667226, + "loss": 3.2822, + "mean_token_accuracy": 0.3934967517852783, + "num_tokens": 1476937288.0, + "step": 2889 + }, + { + "epoch": 0.7815035154137372, + "grad_norm": 3.28125, + "learning_rate": 0.01925825692714473, + "loss": 3.2433, + "mean_token_accuracy": 0.36882033944129944, + "num_tokens": 1477461323.0, + "step": 2890 + }, + { + "epoch": 0.7817739318550568, + "grad_norm": 31.75, + "learning_rate": 0.019257630080234626, + "loss": 18.1465, + "mean_token_accuracy": 0.037805382162332535, + "num_tokens": 1477961308.0, + "step": 2891 + }, + { + "epoch": 0.7820443482963764, + "grad_norm": 5.875, + "learning_rate": 0.01925700297995615, + "loss": 3.8913, + "mean_token_accuracy": 0.32630422711372375, + "num_tokens": 1478485404.0, + "step": 2892 + }, + { + "epoch": 0.782314764737696, + "grad_norm": 1.7421875, + "learning_rate": 0.01925637562632854, + "loss": 3.4289, + "mean_token_accuracy": 0.38159316778182983, + "num_tokens": 1478998190.0, + "step": 2893 + }, + { + "epoch": 0.7825851811790157, + "grad_norm": 3.015625, + "learning_rate": 0.019255748019371046, + "loss": 3.5129, + "mean_token_accuracy": 0.3613239824771881, + "num_tokens": 1479522273.0, + "step": 2894 + }, + { + "epoch": 0.7828555976203353, + "grad_norm": 2.875, + "learning_rate": 0.01925512015910292, + "loss": 3.3507, + "mean_token_accuracy": 0.35933446884155273, + "num_tokens": 1480046548.0, + "step": 2895 + }, + { + "epoch": 0.7831260140616549, + "grad_norm": 3.25, + "learning_rate": 0.019254492045543435, + "loss": 3.3015, + "mean_token_accuracy": 0.3778511583805084, + "num_tokens": 1480545516.0, + "step": 2896 + }, + { + "epoch": 0.7833964305029746, + "grad_norm": 2.90625, + "learning_rate": 0.019253863678711857, + "loss": 3.4353, + "mean_token_accuracy": 0.3636857271194458, + "num_tokens": 1481069761.0, + "step": 2897 + }, + { + "epoch": 0.7836668469442942, + "grad_norm": 4.15625, + "learning_rate": 0.019253235058627468, + "loss": 3.4216, + "mean_token_accuracy": 0.3770291209220886, + "num_tokens": 1481567750.0, + "step": 2898 + }, + { + "epoch": 0.7839372633856139, + "grad_norm": 3.421875, + "learning_rate": 0.019252606185309554, + "loss": 3.5469, + "mean_token_accuracy": 0.3659589886665344, + "num_tokens": 1482091991.0, + "step": 2899 + }, + { + "epoch": 0.7842076798269335, + "grad_norm": 3.90625, + "learning_rate": 0.01925197705877742, + "loss": 3.6611, + "mean_token_accuracy": 0.33826956152915955, + "num_tokens": 1482616275.0, + "step": 2900 + }, + { + "epoch": 0.7844780962682532, + "grad_norm": 5.3125, + "learning_rate": 0.01925134767905036, + "loss": 3.1419, + "mean_token_accuracy": 0.4457795023918152, + "num_tokens": 1483016448.0, + "step": 2901 + }, + { + "epoch": 0.7847485127095727, + "grad_norm": 2.78125, + "learning_rate": 0.019250718046147683, + "loss": 3.2844, + "mean_token_accuracy": 0.3417096436023712, + "num_tokens": 1483540518.0, + "step": 2902 + }, + { + "epoch": 0.7850189291508923, + "grad_norm": 3.546875, + "learning_rate": 0.019250088160088715, + "loss": 3.6146, + "mean_token_accuracy": 0.336540549993515, + "num_tokens": 1484034378.0, + "step": 2903 + }, + { + "epoch": 0.785289345592212, + "grad_norm": 2.578125, + "learning_rate": 0.01924945802089278, + "loss": 3.4782, + "mean_token_accuracy": 0.37148624658584595, + "num_tokens": 1484523881.0, + "step": 2904 + }, + { + "epoch": 0.7855597620335316, + "grad_norm": 3.578125, + "learning_rate": 0.019248827628579216, + "loss": 3.4265, + "mean_token_accuracy": 0.3573753833770752, + "num_tokens": 1485048108.0, + "step": 2905 + }, + { + "epoch": 0.7858301784748513, + "grad_norm": 2.6875, + "learning_rate": 0.01924819698316736, + "loss": 3.4389, + "mean_token_accuracy": 0.3797379732131958, + "num_tokens": 1485522349.0, + "step": 2906 + }, + { + "epoch": 0.7861005949161709, + "grad_norm": 2.328125, + "learning_rate": 0.019247566084676565, + "loss": 3.217, + "mean_token_accuracy": 0.3751029968261719, + "num_tokens": 1486046485.0, + "step": 2907 + }, + { + "epoch": 0.7863710113574905, + "grad_norm": 2.171875, + "learning_rate": 0.019246934933126184, + "loss": 3.3785, + "mean_token_accuracy": 0.383444607257843, + "num_tokens": 1486570663.0, + "step": 2908 + }, + { + "epoch": 0.7866414277988102, + "grad_norm": 2.5625, + "learning_rate": 0.019246303528535593, + "loss": 3.3564, + "mean_token_accuracy": 0.3673546612262726, + "num_tokens": 1487094913.0, + "step": 2909 + }, + { + "epoch": 0.7869118442401298, + "grad_norm": 2.671875, + "learning_rate": 0.019245671870924155, + "loss": 3.2893, + "mean_token_accuracy": 0.3638598322868347, + "num_tokens": 1487619173.0, + "step": 2910 + }, + { + "epoch": 0.7871822606814495, + "grad_norm": 46.25, + "learning_rate": 0.019245039960311256, + "loss": 21.9142, + "mean_token_accuracy": 0.0, + "num_tokens": 1488143441.0, + "step": 2911 + }, + { + "epoch": 0.7874526771227691, + "grad_norm": 14.75, + "learning_rate": 0.019244407796716285, + "loss": 4.0939, + "mean_token_accuracy": 0.26994508504867554, + "num_tokens": 1488659317.0, + "step": 2912 + }, + { + "epoch": 0.7877230935640886, + "grad_norm": 2.84375, + "learning_rate": 0.01924377538015863, + "loss": 3.6322, + "mean_token_accuracy": 0.35038524866104126, + "num_tokens": 1489183549.0, + "step": 2913 + }, + { + "epoch": 0.7879935100054083, + "grad_norm": 2.796875, + "learning_rate": 0.019243142710657715, + "loss": 3.3132, + "mean_token_accuracy": 0.3292773365974426, + "num_tokens": 1489707737.0, + "step": 2914 + }, + { + "epoch": 0.7882639264467279, + "grad_norm": 2.640625, + "learning_rate": 0.019242509788232928, + "loss": 3.4081, + "mean_token_accuracy": 0.35939884185791016, + "num_tokens": 1490231901.0, + "step": 2915 + }, + { + "epoch": 0.7885343428880476, + "grad_norm": 3.25, + "learning_rate": 0.019241876612903712, + "loss": 3.1144, + "mean_token_accuracy": 0.4260728657245636, + "num_tokens": 1490756111.0, + "step": 2916 + }, + { + "epoch": 0.7888047593293672, + "grad_norm": 2.625, + "learning_rate": 0.019241243184689477, + "loss": 3.4812, + "mean_token_accuracy": 0.39205992221832275, + "num_tokens": 1491169204.0, + "step": 2917 + }, + { + "epoch": 0.7890751757706869, + "grad_norm": 3.46875, + "learning_rate": 0.019240609503609667, + "loss": 3.4601, + "mean_token_accuracy": 0.3580452799797058, + "num_tokens": 1491693381.0, + "step": 2918 + }, + { + "epoch": 0.7893455922120065, + "grad_norm": 3.28125, + "learning_rate": 0.019239975569683723, + "loss": 3.5503, + "mean_token_accuracy": 0.36258405447006226, + "num_tokens": 1492217544.0, + "step": 2919 + }, + { + "epoch": 0.7896160086533262, + "grad_norm": 3.296875, + "learning_rate": 0.0192393413829311, + "loss": 3.7501, + "mean_token_accuracy": 0.3300744295120239, + "num_tokens": 1492681954.0, + "step": 2920 + }, + { + "epoch": 0.7898864250946458, + "grad_norm": 2.71875, + "learning_rate": 0.019238706943371247, + "loss": 3.4445, + "mean_token_accuracy": 0.36765819787979126, + "num_tokens": 1493206230.0, + "step": 2921 + }, + { + "epoch": 0.7901568415359654, + "grad_norm": 3.0, + "learning_rate": 0.019238072251023638, + "loss": 3.3819, + "mean_token_accuracy": 0.37394624948501587, + "num_tokens": 1493730406.0, + "step": 2922 + }, + { + "epoch": 0.790427257977285, + "grad_norm": 2.875, + "learning_rate": 0.019237437305907747, + "loss": 3.3978, + "mean_token_accuracy": 0.3756006062030792, + "num_tokens": 1494254634.0, + "step": 2923 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 3.53125, + "learning_rate": 0.019236802108043058, + "loss": 3.404, + "mean_token_accuracy": 0.34958988428115845, + "num_tokens": 1494778905.0, + "step": 2924 + }, + { + "epoch": 0.7909680908599243, + "grad_norm": 3.234375, + "learning_rate": 0.019236166657449053, + "loss": 3.3581, + "mean_token_accuracy": 0.37159451842308044, + "num_tokens": 1495303096.0, + "step": 2925 + }, + { + "epoch": 0.7912385073012439, + "grad_norm": 3.234375, + "learning_rate": 0.019235530954145236, + "loss": 3.5011, + "mean_token_accuracy": 0.36383041739463806, + "num_tokens": 1495823929.0, + "step": 2926 + }, + { + "epoch": 0.7915089237425635, + "grad_norm": 2.796875, + "learning_rate": 0.01923489499815111, + "loss": 3.59, + "mean_token_accuracy": 0.37640154361724854, + "num_tokens": 1496348210.0, + "step": 2927 + }, + { + "epoch": 0.7917793401838832, + "grad_norm": 3.171875, + "learning_rate": 0.01923425878948619, + "loss": 3.5146, + "mean_token_accuracy": 0.354129433631897, + "num_tokens": 1496872461.0, + "step": 2928 + }, + { + "epoch": 0.7920497566252028, + "grad_norm": 2.8125, + "learning_rate": 0.01923362232816999, + "loss": 3.5019, + "mean_token_accuracy": 0.38045769929885864, + "num_tokens": 1497396710.0, + "step": 2929 + }, + { + "epoch": 0.7923201730665225, + "grad_norm": 2.765625, + "learning_rate": 0.019232985614222046, + "loss": 3.4807, + "mean_token_accuracy": 0.35262060165405273, + "num_tokens": 1497920814.0, + "step": 2930 + }, + { + "epoch": 0.7925905895078421, + "grad_norm": 12.3125, + "learning_rate": 0.01923234864766189, + "loss": 16.3717, + "mean_token_accuracy": 0.009242285043001175, + "num_tokens": 1498444936.0, + "step": 2931 + }, + { + "epoch": 0.7928610059491618, + "grad_norm": 4.875, + "learning_rate": 0.01923171142850907, + "loss": 3.8163, + "mean_token_accuracy": 0.3475899398326874, + "num_tokens": 1498926418.0, + "step": 2932 + }, + { + "epoch": 0.7931314223904813, + "grad_norm": 2.328125, + "learning_rate": 0.019231073956783137, + "loss": 3.5402, + "mean_token_accuracy": 0.3588588237762451, + "num_tokens": 1499450604.0, + "step": 2933 + }, + { + "epoch": 0.7934018388318009, + "grad_norm": 3.671875, + "learning_rate": 0.019230436232503645, + "loss": 3.3482, + "mean_token_accuracy": 0.3741194009780884, + "num_tokens": 1499974743.0, + "step": 2934 + }, + { + "epoch": 0.7936722552731206, + "grad_norm": 2.96875, + "learning_rate": 0.019229798255690166, + "loss": 3.5094, + "mean_token_accuracy": 0.35488057136535645, + "num_tokens": 1500498899.0, + "step": 2935 + }, + { + "epoch": 0.7939426717144402, + "grad_norm": 3.15625, + "learning_rate": 0.01922916002636228, + "loss": 3.3698, + "mean_token_accuracy": 0.36840662360191345, + "num_tokens": 1501023028.0, + "step": 2936 + }, + { + "epoch": 0.7942130881557599, + "grad_norm": 3.65625, + "learning_rate": 0.019228521544539558, + "loss": 3.5982, + "mean_token_accuracy": 0.3354009687900543, + "num_tokens": 1501547102.0, + "step": 2937 + }, + { + "epoch": 0.7944835045970795, + "grad_norm": 3.15625, + "learning_rate": 0.019227882810241598, + "loss": 3.4166, + "mean_token_accuracy": 0.3659120500087738, + "num_tokens": 1502071355.0, + "step": 2938 + }, + { + "epoch": 0.7947539210383991, + "grad_norm": 3.296875, + "learning_rate": 0.019227243823487998, + "loss": 3.1744, + "mean_token_accuracy": 0.3896077871322632, + "num_tokens": 1502595502.0, + "step": 2939 + }, + { + "epoch": 0.7950243374797188, + "grad_norm": 2.390625, + "learning_rate": 0.01922660458429836, + "loss": 3.5035, + "mean_token_accuracy": 0.3566368818283081, + "num_tokens": 1503119786.0, + "step": 2940 + }, + { + "epoch": 0.7952947539210384, + "grad_norm": 3.09375, + "learning_rate": 0.019225965092692303, + "loss": 3.021, + "mean_token_accuracy": 0.40755635499954224, + "num_tokens": 1503643941.0, + "step": 2941 + }, + { + "epoch": 0.7955651703623581, + "grad_norm": 4.25, + "learning_rate": 0.019225325348689443, + "loss": 3.6362, + "mean_token_accuracy": 0.3698731064796448, + "num_tokens": 1504138178.0, + "step": 2942 + }, + { + "epoch": 0.7958355868036776, + "grad_norm": 3.734375, + "learning_rate": 0.019224685352309416, + "loss": 3.3321, + "mean_token_accuracy": 0.3480517864227295, + "num_tokens": 1504662351.0, + "step": 2943 + }, + { + "epoch": 0.7961060032449973, + "grad_norm": 2.515625, + "learning_rate": 0.019224045103571855, + "loss": 3.3692, + "mean_token_accuracy": 0.3730711340904236, + "num_tokens": 1505186439.0, + "step": 2944 + }, + { + "epoch": 0.7963764196863169, + "grad_norm": 3.40625, + "learning_rate": 0.019223404602496403, + "loss": 3.3345, + "mean_token_accuracy": 0.3541889190673828, + "num_tokens": 1505710586.0, + "step": 2945 + }, + { + "epoch": 0.7966468361276365, + "grad_norm": 2.796875, + "learning_rate": 0.019222763849102712, + "loss": 3.2691, + "mean_token_accuracy": 0.3899637460708618, + "num_tokens": 1506234788.0, + "step": 2946 + }, + { + "epoch": 0.7969172525689562, + "grad_norm": 4.40625, + "learning_rate": 0.019222122843410447, + "loss": 3.4104, + "mean_token_accuracy": 0.39485588669776917, + "num_tokens": 1506690225.0, + "step": 2947 + }, + { + "epoch": 0.7971876690102758, + "grad_norm": 2.9375, + "learning_rate": 0.019221481585439276, + "loss": 3.4255, + "mean_token_accuracy": 0.3570687472820282, + "num_tokens": 1507205582.0, + "step": 2948 + }, + { + "epoch": 0.7974580854515955, + "grad_norm": 3.203125, + "learning_rate": 0.019220840075208866, + "loss": 3.3337, + "mean_token_accuracy": 0.39085787534713745, + "num_tokens": 1507729863.0, + "step": 2949 + }, + { + "epoch": 0.7977285018929151, + "grad_norm": 3.703125, + "learning_rate": 0.019220198312738912, + "loss": 3.6581, + "mean_token_accuracy": 0.32383763790130615, + "num_tokens": 1508254106.0, + "step": 2950 + }, + { + "epoch": 0.7979989183342348, + "grad_norm": 99.0, + "learning_rate": 0.019219556298049098, + "loss": 17.4951, + "mean_token_accuracy": 0.006088718771934509, + "num_tokens": 1508778371.0, + "step": 2951 + }, + { + "epoch": 0.7982693347755544, + "grad_norm": 5.25, + "learning_rate": 0.019218914031159124, + "loss": 3.9409, + "mean_token_accuracy": 0.34047871828079224, + "num_tokens": 1509302630.0, + "step": 2952 + }, + { + "epoch": 0.798539751216874, + "grad_norm": 2.21875, + "learning_rate": 0.0192182715120887, + "loss": 3.5015, + "mean_token_accuracy": 0.3683987855911255, + "num_tokens": 1509826874.0, + "step": 2953 + }, + { + "epoch": 0.7988101676581936, + "grad_norm": 2.65625, + "learning_rate": 0.019217628740857536, + "loss": 3.3513, + "mean_token_accuracy": 0.3670191168785095, + "num_tokens": 1510351153.0, + "step": 2954 + }, + { + "epoch": 0.7990805840995132, + "grad_norm": 3.703125, + "learning_rate": 0.019216985717485355, + "loss": 3.4717, + "mean_token_accuracy": 0.3988437056541443, + "num_tokens": 1510875240.0, + "step": 2955 + }, + { + "epoch": 0.7993510005408329, + "grad_norm": 2.65625, + "learning_rate": 0.01921634244199189, + "loss": 3.3604, + "mean_token_accuracy": 0.3884795904159546, + "num_tokens": 1511399440.0, + "step": 2956 + }, + { + "epoch": 0.7996214169821525, + "grad_norm": 2.46875, + "learning_rate": 0.01921569891439687, + "loss": 3.3296, + "mean_token_accuracy": 0.3805094063282013, + "num_tokens": 1511923621.0, + "step": 2957 + }, + { + "epoch": 0.7998918334234721, + "grad_norm": 2.984375, + "learning_rate": 0.019215055134720053, + "loss": 3.7074, + "mean_token_accuracy": 0.34050503373146057, + "num_tokens": 1512401611.0, + "step": 2958 + }, + { + "epoch": 0.8001622498647918, + "grad_norm": 3.0625, + "learning_rate": 0.019214411102981183, + "loss": 3.3635, + "mean_token_accuracy": 0.37945884466171265, + "num_tokens": 1512920254.0, + "step": 2959 + }, + { + "epoch": 0.8004326663061114, + "grad_norm": 3.1875, + "learning_rate": 0.019213766819200024, + "loss": 3.5705, + "mean_token_accuracy": 0.3569115102291107, + "num_tokens": 1513444449.0, + "step": 2960 + }, + { + "epoch": 0.8007030827474311, + "grad_norm": 2.6875, + "learning_rate": 0.019213122283396344, + "loss": 3.3992, + "mean_token_accuracy": 0.3724896311759949, + "num_tokens": 1513954818.0, + "step": 2961 + }, + { + "epoch": 0.8009734991887507, + "grad_norm": 2.609375, + "learning_rate": 0.01921247749558992, + "loss": 3.31, + "mean_token_accuracy": 0.3749113976955414, + "num_tokens": 1514456718.0, + "step": 2962 + }, + { + "epoch": 0.8012439156300704, + "grad_norm": 2.015625, + "learning_rate": 0.01921183245580053, + "loss": 3.4543, + "mean_token_accuracy": 0.38657844066619873, + "num_tokens": 1514929555.0, + "step": 2963 + }, + { + "epoch": 0.8015143320713899, + "grad_norm": 3.15625, + "learning_rate": 0.019211187164047978, + "loss": 3.3592, + "mean_token_accuracy": 0.3698018193244934, + "num_tokens": 1515453808.0, + "step": 2964 + }, + { + "epoch": 0.8017847485127095, + "grad_norm": 3.484375, + "learning_rate": 0.019210541620352052, + "loss": 3.3663, + "mean_token_accuracy": 0.4157005548477173, + "num_tokens": 1515906210.0, + "step": 2965 + }, + { + "epoch": 0.8020551649540292, + "grad_norm": 4.0625, + "learning_rate": 0.019209895824732565, + "loss": 3.2257, + "mean_token_accuracy": 0.36434298753738403, + "num_tokens": 1516430336.0, + "step": 2966 + }, + { + "epoch": 0.8023255813953488, + "grad_norm": 3.6875, + "learning_rate": 0.01920924977720933, + "loss": 3.6206, + "mean_token_accuracy": 0.3513255715370178, + "num_tokens": 1516954516.0, + "step": 2967 + }, + { + "epoch": 0.8025959978366685, + "grad_norm": 4.34375, + "learning_rate": 0.019208603477802167, + "loss": 3.5759, + "mean_token_accuracy": 0.36446642875671387, + "num_tokens": 1517461911.0, + "step": 2968 + }, + { + "epoch": 0.8028664142779881, + "grad_norm": 2.921875, + "learning_rate": 0.019207956926530913, + "loss": 3.2353, + "mean_token_accuracy": 0.36914151906967163, + "num_tokens": 1517986160.0, + "step": 2969 + }, + { + "epoch": 0.8031368307193077, + "grad_norm": 2.78125, + "learning_rate": 0.0192073101234154, + "loss": 3.2624, + "mean_token_accuracy": 0.39284747838974, + "num_tokens": 1518453238.0, + "step": 2970 + }, + { + "epoch": 0.8034072471606274, + "grad_norm": 8.625, + "learning_rate": 0.01920666306847548, + "loss": 10.7402, + "mean_token_accuracy": 0.006588927004486322, + "num_tokens": 1518977408.0, + "step": 2971 + }, + { + "epoch": 0.803677663601947, + "grad_norm": 14.25, + "learning_rate": 0.019206015761731, + "loss": 4.2756, + "mean_token_accuracy": 0.2796518802642822, + "num_tokens": 1519485392.0, + "step": 2972 + }, + { + "epoch": 0.8039480800432667, + "grad_norm": 2.3125, + "learning_rate": 0.019205368203201824, + "loss": 3.788, + "mean_token_accuracy": 0.31810569763183594, + "num_tokens": 1520009518.0, + "step": 2973 + }, + { + "epoch": 0.8042184964845862, + "grad_norm": 2.875, + "learning_rate": 0.019204720392907822, + "loss": 3.2194, + "mean_token_accuracy": 0.35613393783569336, + "num_tokens": 1520533646.0, + "step": 2974 + }, + { + "epoch": 0.8044889129259059, + "grad_norm": 3.640625, + "learning_rate": 0.019204072330868873, + "loss": 3.5407, + "mean_token_accuracy": 0.37396323680877686, + "num_tokens": 1520999793.0, + "step": 2975 + }, + { + "epoch": 0.8047593293672255, + "grad_norm": 3.9375, + "learning_rate": 0.019203424017104852, + "loss": 3.7267, + "mean_token_accuracy": 0.33720532059669495, + "num_tokens": 1521524061.0, + "step": 2976 + }, + { + "epoch": 0.8050297458085451, + "grad_norm": 2.984375, + "learning_rate": 0.01920277545163566, + "loss": 3.2916, + "mean_token_accuracy": 0.39464157819747925, + "num_tokens": 1522048242.0, + "step": 2977 + }, + { + "epoch": 0.8053001622498648, + "grad_norm": 3.125, + "learning_rate": 0.019202126634481197, + "loss": 3.5967, + "mean_token_accuracy": 0.3411217927932739, + "num_tokens": 1522572301.0, + "step": 2978 + }, + { + "epoch": 0.8055705786911844, + "grad_norm": 2.75, + "learning_rate": 0.019201477565661366, + "loss": 3.3898, + "mean_token_accuracy": 0.3843947649002075, + "num_tokens": 1523067422.0, + "step": 2979 + }, + { + "epoch": 0.8058409951325041, + "grad_norm": 3.59375, + "learning_rate": 0.019200828245196082, + "loss": 3.4631, + "mean_token_accuracy": 0.34731507301330566, + "num_tokens": 1523591693.0, + "step": 2980 + }, + { + "epoch": 0.8061114115738237, + "grad_norm": 2.28125, + "learning_rate": 0.019200178673105275, + "loss": 3.4018, + "mean_token_accuracy": 0.3718505799770355, + "num_tokens": 1524094727.0, + "step": 2981 + }, + { + "epoch": 0.8063818280151434, + "grad_norm": 2.65625, + "learning_rate": 0.019199528849408867, + "loss": 3.3855, + "mean_token_accuracy": 0.37565693259239197, + "num_tokens": 1524618788.0, + "step": 2982 + }, + { + "epoch": 0.806652244456463, + "grad_norm": 3.09375, + "learning_rate": 0.0191988787741268, + "loss": 3.44, + "mean_token_accuracy": 0.37260642647743225, + "num_tokens": 1525090292.0, + "step": 2983 + }, + { + "epoch": 0.8069226608977826, + "grad_norm": 3.3125, + "learning_rate": 0.019198228447279024, + "loss": 3.5847, + "mean_token_accuracy": 0.37951165437698364, + "num_tokens": 1525555321.0, + "step": 2984 + }, + { + "epoch": 0.8071930773391022, + "grad_norm": 2.5, + "learning_rate": 0.019197577868885488, + "loss": 3.2747, + "mean_token_accuracy": 0.3723815977573395, + "num_tokens": 1526079507.0, + "step": 2985 + }, + { + "epoch": 0.8074634937804218, + "grad_norm": 3.09375, + "learning_rate": 0.01919692703896616, + "loss": 3.5359, + "mean_token_accuracy": 0.34532830119132996, + "num_tokens": 1526603561.0, + "step": 2986 + }, + { + "epoch": 0.8077339102217415, + "grad_norm": 3.390625, + "learning_rate": 0.019196275957541, + "loss": 3.5128, + "mean_token_accuracy": 0.35822176933288574, + "num_tokens": 1527127834.0, + "step": 2987 + }, + { + "epoch": 0.8080043266630611, + "grad_norm": 3.140625, + "learning_rate": 0.01919562462462999, + "loss": 3.3882, + "mean_token_accuracy": 0.37459123134613037, + "num_tokens": 1527592039.0, + "step": 2988 + }, + { + "epoch": 0.8082747431043807, + "grad_norm": 2.984375, + "learning_rate": 0.019194973040253117, + "loss": 3.3605, + "mean_token_accuracy": 0.3821169137954712, + "num_tokens": 1528116272.0, + "step": 2989 + }, + { + "epoch": 0.8085451595457004, + "grad_norm": 2.6875, + "learning_rate": 0.01919432120443037, + "loss": 3.4088, + "mean_token_accuracy": 0.3715609908103943, + "num_tokens": 1528633592.0, + "step": 2990 + }, + { + "epoch": 0.80881557598702, + "grad_norm": 135.0, + "learning_rate": 0.01919366911718175, + "loss": 21.872, + "mean_token_accuracy": 0.0, + "num_tokens": 1529157832.0, + "step": 2991 + }, + { + "epoch": 0.8090859924283397, + "grad_norm": 7.1875, + "learning_rate": 0.019193016778527267, + "loss": 4.0984, + "mean_token_accuracy": 0.28431516885757446, + "num_tokens": 1529682102.0, + "step": 2992 + }, + { + "epoch": 0.8093564088696593, + "grad_norm": 2.140625, + "learning_rate": 0.019192364188486934, + "loss": 3.3071, + "mean_token_accuracy": 0.37766700983047485, + "num_tokens": 1530180469.0, + "step": 2993 + }, + { + "epoch": 0.809626825310979, + "grad_norm": 2.703125, + "learning_rate": 0.019191711347080777, + "loss": 3.4657, + "mean_token_accuracy": 0.36611291766166687, + "num_tokens": 1530672523.0, + "step": 2994 + }, + { + "epoch": 0.8098972417522985, + "grad_norm": 3.03125, + "learning_rate": 0.019191058254328826, + "loss": 3.4352, + "mean_token_accuracy": 0.36455294489860535, + "num_tokens": 1531196624.0, + "step": 2995 + }, + { + "epoch": 0.8101676581936181, + "grad_norm": 2.671875, + "learning_rate": 0.01919040491025112, + "loss": 3.1921, + "mean_token_accuracy": 0.3934268355369568, + "num_tokens": 1531676559.0, + "step": 2996 + }, + { + "epoch": 0.8104380746349378, + "grad_norm": 3.375, + "learning_rate": 0.0191897513148677, + "loss": 3.5108, + "mean_token_accuracy": 0.34322383999824524, + "num_tokens": 1532200718.0, + "step": 2997 + }, + { + "epoch": 0.8107084910762574, + "grad_norm": 2.953125, + "learning_rate": 0.019189097468198628, + "loss": 3.3839, + "mean_token_accuracy": 0.3554407060146332, + "num_tokens": 1532724958.0, + "step": 2998 + }, + { + "epoch": 0.8109789075175771, + "grad_norm": 2.671875, + "learning_rate": 0.019188443370263965, + "loss": 3.3955, + "mean_token_accuracy": 0.37701836228370667, + "num_tokens": 1533214569.0, + "step": 2999 + }, + { + "epoch": 0.8112493239588967, + "grad_norm": 2.8125, + "learning_rate": 0.019187789021083774, + "loss": 3.3178, + "mean_token_accuracy": 0.3371281027793884, + "num_tokens": 1533738801.0, + "step": 3000 + }, + { + "epoch": 0.8115197404002163, + "grad_norm": 2.3125, + "learning_rate": 0.019187134420678143, + "loss": 3.2228, + "mean_token_accuracy": 0.39726197719573975, + "num_tokens": 1534259549.0, + "step": 3001 + }, + { + "epoch": 0.811790156841536, + "grad_norm": 2.78125, + "learning_rate": 0.019186479569067147, + "loss": 3.4603, + "mean_token_accuracy": 0.37112951278686523, + "num_tokens": 1534783713.0, + "step": 3002 + }, + { + "epoch": 0.8120605732828556, + "grad_norm": 3.1875, + "learning_rate": 0.019185824466270884, + "loss": 3.454, + "mean_token_accuracy": 0.3565137982368469, + "num_tokens": 1535307972.0, + "step": 3003 + }, + { + "epoch": 0.8123309897241753, + "grad_norm": 3.515625, + "learning_rate": 0.019185169112309452, + "loss": 3.4602, + "mean_token_accuracy": 0.3392719626426697, + "num_tokens": 1535832213.0, + "step": 3004 + }, + { + "epoch": 0.8126014061654948, + "grad_norm": 3.375, + "learning_rate": 0.019184513507202964, + "loss": 3.4065, + "mean_token_accuracy": 0.3701915740966797, + "num_tokens": 1536319191.0, + "step": 3005 + }, + { + "epoch": 0.8128718226068145, + "grad_norm": 3.015625, + "learning_rate": 0.01918385765097153, + "loss": 3.4268, + "mean_token_accuracy": 0.36742889881134033, + "num_tokens": 1536843350.0, + "step": 3006 + }, + { + "epoch": 0.8131422390481341, + "grad_norm": 3.71875, + "learning_rate": 0.019183201543635276, + "loss": 3.4204, + "mean_token_accuracy": 0.36694175004959106, + "num_tokens": 1537367369.0, + "step": 3007 + }, + { + "epoch": 0.8134126554894537, + "grad_norm": 31.375, + "learning_rate": 0.019182545185214336, + "loss": 3.5457, + "mean_token_accuracy": 0.33154571056365967, + "num_tokens": 1537891508.0, + "step": 3008 + }, + { + "epoch": 0.8136830719307734, + "grad_norm": 4.75, + "learning_rate": 0.019181888575728844, + "loss": 3.7181, + "mean_token_accuracy": 0.35392940044403076, + "num_tokens": 1538384383.0, + "step": 3009 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 44.25, + "learning_rate": 0.01918123171519895, + "loss": 3.8012, + "mean_token_accuracy": 0.33542871475219727, + "num_tokens": 1538908517.0, + "step": 3010 + }, + { + "epoch": 0.8142239048134127, + "grad_norm": 348.0, + "learning_rate": 0.019180574603644807, + "loss": 17.1771, + "mean_token_accuracy": 0.0001414156286045909, + "num_tokens": 1539432762.0, + "step": 3011 + }, + { + "epoch": 0.8144943212547323, + "grad_norm": 8.125, + "learning_rate": 0.01917991724108658, + "loss": 4.4156, + "mean_token_accuracy": 0.26300761103630066, + "num_tokens": 1539893205.0, + "step": 3012 + }, + { + "epoch": 0.814764737696052, + "grad_norm": 4.625, + "learning_rate": 0.019179259627544434, + "loss": 3.5275, + "mean_token_accuracy": 0.34865623712539673, + "num_tokens": 1540406047.0, + "step": 3013 + }, + { + "epoch": 0.8150351541373716, + "grad_norm": 3.484375, + "learning_rate": 0.01917860176303855, + "loss": 3.7151, + "mean_token_accuracy": 0.3406992256641388, + "num_tokens": 1540916144.0, + "step": 3014 + }, + { + "epoch": 0.8153055705786911, + "grad_norm": 12.5625, + "learning_rate": 0.01917794364758911, + "loss": 3.8918, + "mean_token_accuracy": 0.3080238699913025, + "num_tokens": 1541440425.0, + "step": 3015 + }, + { + "epoch": 0.8155759870200108, + "grad_norm": 2.765625, + "learning_rate": 0.01917728528121631, + "loss": 3.4703, + "mean_token_accuracy": 0.3862724304199219, + "num_tokens": 1541964640.0, + "step": 3016 + }, + { + "epoch": 0.8158464034613304, + "grad_norm": 2.671875, + "learning_rate": 0.019176626663940353, + "loss": 3.4886, + "mean_token_accuracy": 0.3642658591270447, + "num_tokens": 1542488743.0, + "step": 3017 + }, + { + "epoch": 0.8161168199026501, + "grad_norm": 4.03125, + "learning_rate": 0.019175967795781443, + "loss": 3.5396, + "mean_token_accuracy": 0.3545479476451874, + "num_tokens": 1543012994.0, + "step": 3018 + }, + { + "epoch": 0.8163872363439697, + "grad_norm": 2.515625, + "learning_rate": 0.019175308676759793, + "loss": 3.3747, + "mean_token_accuracy": 0.37956956028938293, + "num_tokens": 1543537047.0, + "step": 3019 + }, + { + "epoch": 0.8166576527852893, + "grad_norm": 4.5, + "learning_rate": 0.019174649306895636, + "loss": 3.5776, + "mean_token_accuracy": 0.3392826020717621, + "num_tokens": 1544061282.0, + "step": 3020 + }, + { + "epoch": 0.816928069226609, + "grad_norm": 2.375, + "learning_rate": 0.019173989686209193, + "loss": 3.1866, + "mean_token_accuracy": 0.39318740367889404, + "num_tokens": 1544537590.0, + "step": 3021 + }, + { + "epoch": 0.8171984856679286, + "grad_norm": 4.46875, + "learning_rate": 0.019173329814720714, + "loss": 3.369, + "mean_token_accuracy": 0.36580437421798706, + "num_tokens": 1545061777.0, + "step": 3022 + }, + { + "epoch": 0.8174689021092483, + "grad_norm": 3.828125, + "learning_rate": 0.019172669692450434, + "loss": 3.4653, + "mean_token_accuracy": 0.36248573660850525, + "num_tokens": 1545574866.0, + "step": 3023 + }, + { + "epoch": 0.8177393185505679, + "grad_norm": 3.234375, + "learning_rate": 0.019172009319418617, + "loss": 3.4073, + "mean_token_accuracy": 0.370788037776947, + "num_tokens": 1546099044.0, + "step": 3024 + }, + { + "epoch": 0.8180097349918876, + "grad_norm": 3.609375, + "learning_rate": 0.01917134869564552, + "loss": 3.5968, + "mean_token_accuracy": 0.34911084175109863, + "num_tokens": 1546567356.0, + "step": 3025 + }, + { + "epoch": 0.8182801514332071, + "grad_norm": 3.53125, + "learning_rate": 0.019170687821151416, + "loss": 3.422, + "mean_token_accuracy": 0.38050007820129395, + "num_tokens": 1547091345.0, + "step": 3026 + }, + { + "epoch": 0.8185505678745267, + "grad_norm": 3.09375, + "learning_rate": 0.019170026695956584, + "loss": 3.6476, + "mean_token_accuracy": 0.3495713770389557, + "num_tokens": 1547592037.0, + "step": 3027 + }, + { + "epoch": 0.8188209843158464, + "grad_norm": 2.40625, + "learning_rate": 0.019169365320081302, + "loss": 3.4846, + "mean_token_accuracy": 0.34880587458610535, + "num_tokens": 1548116238.0, + "step": 3028 + }, + { + "epoch": 0.819091400757166, + "grad_norm": 2.546875, + "learning_rate": 0.01916870369354587, + "loss": 3.5878, + "mean_token_accuracy": 0.35460853576660156, + "num_tokens": 1548640519.0, + "step": 3029 + }, + { + "epoch": 0.8193618171984857, + "grad_norm": 2.890625, + "learning_rate": 0.019168041816370586, + "loss": 3.4558, + "mean_token_accuracy": 0.3634932339191437, + "num_tokens": 1549164739.0, + "step": 3030 + }, + { + "epoch": 0.8196322336398053, + "grad_norm": 21.375, + "learning_rate": 0.019167379688575757, + "loss": 10.1916, + "mean_token_accuracy": 0.0017177672125399113, + "num_tokens": 1549672078.0, + "step": 3031 + }, + { + "epoch": 0.819902650081125, + "grad_norm": 6.875, + "learning_rate": 0.0191667173101817, + "loss": 4.0104, + "mean_token_accuracy": 0.3220348656177521, + "num_tokens": 1550196255.0, + "step": 3032 + }, + { + "epoch": 0.8201730665224446, + "grad_norm": 2.375, + "learning_rate": 0.01916605468120874, + "loss": 3.4389, + "mean_token_accuracy": 0.35483306646347046, + "num_tokens": 1550720411.0, + "step": 3033 + }, + { + "epoch": 0.8204434829637642, + "grad_norm": 2.4375, + "learning_rate": 0.01916539180167721, + "loss": 3.2939, + "mean_token_accuracy": 0.3812757134437561, + "num_tokens": 1551244522.0, + "step": 3034 + }, + { + "epoch": 0.8207138994050839, + "grad_norm": 2.921875, + "learning_rate": 0.019164728671607442, + "loss": 3.3266, + "mean_token_accuracy": 0.3605799376964569, + "num_tokens": 1551768693.0, + "step": 3035 + }, + { + "epoch": 0.8209843158464034, + "grad_norm": 3.25, + "learning_rate": 0.019164065291019796, + "loss": 3.4729, + "mean_token_accuracy": 0.37823328375816345, + "num_tokens": 1552252073.0, + "step": 3036 + }, + { + "epoch": 0.821254732287723, + "grad_norm": 3.734375, + "learning_rate": 0.01916340165993461, + "loss": 3.6292, + "mean_token_accuracy": 0.3674654960632324, + "num_tokens": 1552776176.0, + "step": 3037 + }, + { + "epoch": 0.8215251487290427, + "grad_norm": 3.921875, + "learning_rate": 0.01916273777837225, + "loss": 3.4124, + "mean_token_accuracy": 0.352752685546875, + "num_tokens": 1553300391.0, + "step": 3038 + }, + { + "epoch": 0.8217955651703623, + "grad_norm": 2.515625, + "learning_rate": 0.0191620736463531, + "loss": 3.4233, + "mean_token_accuracy": 0.3665319085121155, + "num_tokens": 1553824543.0, + "step": 3039 + }, + { + "epoch": 0.822065981611682, + "grad_norm": 2.828125, + "learning_rate": 0.019161409263897523, + "loss": 3.2768, + "mean_token_accuracy": 0.3691909909248352, + "num_tokens": 1554348750.0, + "step": 3040 + }, + { + "epoch": 0.8223363980530016, + "grad_norm": 2.21875, + "learning_rate": 0.01916074463102591, + "loss": 3.4884, + "mean_token_accuracy": 0.3623095750808716, + "num_tokens": 1554873034.0, + "step": 3041 + }, + { + "epoch": 0.8226068144943213, + "grad_norm": 3.8125, + "learning_rate": 0.019160079747758654, + "loss": 3.5058, + "mean_token_accuracy": 0.35575124621391296, + "num_tokens": 1555397220.0, + "step": 3042 + }, + { + "epoch": 0.8228772309356409, + "grad_norm": 2.46875, + "learning_rate": 0.01915941461411615, + "loss": 3.27, + "mean_token_accuracy": 0.4050061106681824, + "num_tokens": 1555921332.0, + "step": 3043 + }, + { + "epoch": 0.8231476473769606, + "grad_norm": 3.015625, + "learning_rate": 0.019158749230118812, + "loss": 3.3897, + "mean_token_accuracy": 0.36567163467407227, + "num_tokens": 1556445602.0, + "step": 3044 + }, + { + "epoch": 0.8234180638182802, + "grad_norm": 2.5, + "learning_rate": 0.01915808359578705, + "loss": 3.2218, + "mean_token_accuracy": 0.38669949769973755, + "num_tokens": 1556963950.0, + "step": 3045 + }, + { + "epoch": 0.8236884802595997, + "grad_norm": 3.671875, + "learning_rate": 0.019157417711141298, + "loss": 3.3798, + "mean_token_accuracy": 0.3318042457103729, + "num_tokens": 1557444200.0, + "step": 3046 + }, + { + "epoch": 0.8239588967009194, + "grad_norm": 2.828125, + "learning_rate": 0.01915675157620198, + "loss": 3.4295, + "mean_token_accuracy": 0.38693761825561523, + "num_tokens": 1557910475.0, + "step": 3047 + }, + { + "epoch": 0.824229313142239, + "grad_norm": 3.140625, + "learning_rate": 0.01915608519098954, + "loss": 3.2868, + "mean_token_accuracy": 0.3823813498020172, + "num_tokens": 1558434728.0, + "step": 3048 + }, + { + "epoch": 0.8244997295835587, + "grad_norm": 3.03125, + "learning_rate": 0.01915541855552441, + "loss": 3.5745, + "mean_token_accuracy": 0.32581883668899536, + "num_tokens": 1558958908.0, + "step": 3049 + }, + { + "epoch": 0.8247701460248783, + "grad_norm": 2.828125, + "learning_rate": 0.019154751669827063, + "loss": 3.5012, + "mean_token_accuracy": 0.3587404489517212, + "num_tokens": 1559483128.0, + "step": 3050 + }, + { + "epoch": 0.825040562466198, + "grad_norm": 1.9140625, + "learning_rate": 0.019154084533917953, + "loss": 9.9107, + "mean_token_accuracy": 0.0064262161031365395, + "num_tokens": 1559950800.0, + "step": 3051 + }, + { + "epoch": 0.8253109789075176, + "grad_norm": 6.3125, + "learning_rate": 0.019153417147817552, + "loss": 4.0653, + "mean_token_accuracy": 0.2764277458190918, + "num_tokens": 1560420668.0, + "step": 3052 + }, + { + "epoch": 0.8255813953488372, + "grad_norm": 9.3125, + "learning_rate": 0.019152749511546333, + "loss": 3.445, + "mean_token_accuracy": 0.3991468548774719, + "num_tokens": 1560855941.0, + "step": 3053 + }, + { + "epoch": 0.8258518117901569, + "grad_norm": 2.609375, + "learning_rate": 0.01915208162512478, + "loss": 3.4539, + "mean_token_accuracy": 0.3628990054130554, + "num_tokens": 1561342373.0, + "step": 3054 + }, + { + "epoch": 0.8261222282314765, + "grad_norm": 3.015625, + "learning_rate": 0.01915141348857339, + "loss": 3.7651, + "mean_token_accuracy": 0.3329852819442749, + "num_tokens": 1561866647.0, + "step": 3055 + }, + { + "epoch": 0.826392644672796, + "grad_norm": 4.25, + "learning_rate": 0.019150745101912666, + "loss": 3.6698, + "mean_token_accuracy": 0.33081692457199097, + "num_tokens": 1562390927.0, + "step": 3056 + }, + { + "epoch": 0.8266630611141157, + "grad_norm": 2.90625, + "learning_rate": 0.019150076465163108, + "loss": 3.5421, + "mean_token_accuracy": 0.34893369674682617, + "num_tokens": 1562915110.0, + "step": 3057 + }, + { + "epoch": 0.8269334775554353, + "grad_norm": 2.765625, + "learning_rate": 0.01914940757834524, + "loss": 3.5996, + "mean_token_accuracy": 0.337335467338562, + "num_tokens": 1563435537.0, + "step": 3058 + }, + { + "epoch": 0.827203893996755, + "grad_norm": 3.1875, + "learning_rate": 0.01914873844147958, + "loss": 3.4599, + "mean_token_accuracy": 0.3778139650821686, + "num_tokens": 1563959790.0, + "step": 3059 + }, + { + "epoch": 0.8274743104380746, + "grad_norm": 3.03125, + "learning_rate": 0.019148069054586662, + "loss": 3.4257, + "mean_token_accuracy": 0.3228936195373535, + "num_tokens": 1564484060.0, + "step": 3060 + }, + { + "epoch": 0.8277447268793943, + "grad_norm": 2.96875, + "learning_rate": 0.019147399417687023, + "loss": 3.4978, + "mean_token_accuracy": 0.37072670459747314, + "num_tokens": 1565008305.0, + "step": 3061 + }, + { + "epoch": 0.8280151433207139, + "grad_norm": 3.734375, + "learning_rate": 0.019146729530801206, + "loss": 3.5191, + "mean_token_accuracy": 0.35145455598831177, + "num_tokens": 1565524004.0, + "step": 3062 + }, + { + "epoch": 0.8282855597620336, + "grad_norm": 2.984375, + "learning_rate": 0.019146059393949776, + "loss": 3.488, + "mean_token_accuracy": 0.3579902648925781, + "num_tokens": 1566048096.0, + "step": 3063 + }, + { + "epoch": 0.8285559762033532, + "grad_norm": 2.34375, + "learning_rate": 0.019145389007153284, + "loss": 3.2591, + "mean_token_accuracy": 0.36457741260528564, + "num_tokens": 1566572371.0, + "step": 3064 + }, + { + "epoch": 0.8288263926446728, + "grad_norm": 2.9375, + "learning_rate": 0.019144718370432304, + "loss": 3.2817, + "mean_token_accuracy": 0.3823373317718506, + "num_tokens": 1567057679.0, + "step": 3065 + }, + { + "epoch": 0.8290968090859925, + "grad_norm": 2.59375, + "learning_rate": 0.019144047483807408, + "loss": 3.3775, + "mean_token_accuracy": 0.365894079208374, + "num_tokens": 1567581819.0, + "step": 3066 + }, + { + "epoch": 0.829367225527312, + "grad_norm": 2.890625, + "learning_rate": 0.01914337634729919, + "loss": 3.2644, + "mean_token_accuracy": 0.3914370834827423, + "num_tokens": 1568096218.0, + "step": 3067 + }, + { + "epoch": 0.8296376419686317, + "grad_norm": 2.875, + "learning_rate": 0.019142704960928236, + "loss": 3.2714, + "mean_token_accuracy": 0.3854399025440216, + "num_tokens": 1568620455.0, + "step": 3068 + }, + { + "epoch": 0.8299080584099513, + "grad_norm": 3.28125, + "learning_rate": 0.01914203332471515, + "loss": 3.5946, + "mean_token_accuracy": 0.3771442174911499, + "num_tokens": 1569144586.0, + "step": 3069 + }, + { + "epoch": 0.8301784748512709, + "grad_norm": 3.0625, + "learning_rate": 0.019141361438680535, + "loss": 3.4971, + "mean_token_accuracy": 0.3619561195373535, + "num_tokens": 1569655484.0, + "step": 3070 + }, + { + "epoch": 0.8304488912925906, + "grad_norm": 0.44140625, + "learning_rate": 0.019140689302845012, + "loss": 11.1232, + "mean_token_accuracy": 2.0748073438880965e-05, + "num_tokens": 1570127846.0, + "step": 3071 + }, + { + "epoch": 0.8307193077339102, + "grad_norm": 8.5, + "learning_rate": 0.019140016917229195, + "loss": 4.0983, + "mean_token_accuracy": 0.32889223098754883, + "num_tokens": 1570652124.0, + "step": 3072 + }, + { + "epoch": 0.8309897241752299, + "grad_norm": 3.234375, + "learning_rate": 0.019139344281853727, + "loss": 3.2992, + "mean_token_accuracy": 0.3789193630218506, + "num_tokens": 1571156053.0, + "step": 3073 + }, + { + "epoch": 0.8312601406165495, + "grad_norm": 3.0625, + "learning_rate": 0.019138671396739238, + "loss": 3.7697, + "mean_token_accuracy": 0.33496299386024475, + "num_tokens": 1571680323.0, + "step": 3074 + }, + { + "epoch": 0.8315305570578692, + "grad_norm": 3.375, + "learning_rate": 0.01913799826190637, + "loss": 3.43, + "mean_token_accuracy": 0.3513968288898468, + "num_tokens": 1572204574.0, + "step": 3075 + }, + { + "epoch": 0.8318009734991888, + "grad_norm": 4.1875, + "learning_rate": 0.01913732487737579, + "loss": 3.563, + "mean_token_accuracy": 0.33720242977142334, + "num_tokens": 1572728853.0, + "step": 3076 + }, + { + "epoch": 0.8320713899405083, + "grad_norm": 3.234375, + "learning_rate": 0.019136651243168153, + "loss": 3.562, + "mean_token_accuracy": 0.33821314573287964, + "num_tokens": 1573216999.0, + "step": 3077 + }, + { + "epoch": 0.832341806381828, + "grad_norm": 3.140625, + "learning_rate": 0.019135977359304127, + "loss": 3.5177, + "mean_token_accuracy": 0.3556601107120514, + "num_tokens": 1573694449.0, + "step": 3078 + }, + { + "epoch": 0.8326122228231476, + "grad_norm": 3.109375, + "learning_rate": 0.019135303225804385, + "loss": 3.387, + "mean_token_accuracy": 0.36824411153793335, + "num_tokens": 1574218676.0, + "step": 3079 + }, + { + "epoch": 0.8328826392644673, + "grad_norm": 3.421875, + "learning_rate": 0.01913462884268962, + "loss": 3.504, + "mean_token_accuracy": 0.36360424757003784, + "num_tokens": 1574661091.0, + "step": 3080 + }, + { + "epoch": 0.8331530557057869, + "grad_norm": 4.53125, + "learning_rate": 0.019133954209980517, + "loss": 3.831, + "mean_token_accuracy": 0.3220997154712677, + "num_tokens": 1575185209.0, + "step": 3081 + }, + { + "epoch": 0.8334234721471065, + "grad_norm": 3.1875, + "learning_rate": 0.01913327932769778, + "loss": 3.5836, + "mean_token_accuracy": 0.35167473554611206, + "num_tokens": 1575709479.0, + "step": 3082 + }, + { + "epoch": 0.8336938885884262, + "grad_norm": 3.828125, + "learning_rate": 0.019132604195862116, + "loss": 3.2459, + "mean_token_accuracy": 0.35420703887939453, + "num_tokens": 1576233695.0, + "step": 3083 + }, + { + "epoch": 0.8339643050297458, + "grad_norm": 11.75, + "learning_rate": 0.01913192881449424, + "loss": 3.3154, + "mean_token_accuracy": 0.346769243478775, + "num_tokens": 1576757880.0, + "step": 3084 + }, + { + "epoch": 0.8342347214710655, + "grad_norm": 2.21875, + "learning_rate": 0.019131253183614867, + "loss": 3.5614, + "mean_token_accuracy": 0.3525521457195282, + "num_tokens": 1577282072.0, + "step": 3085 + }, + { + "epoch": 0.8345051379123851, + "grad_norm": 3.796875, + "learning_rate": 0.01913057730324474, + "loss": 3.3439, + "mean_token_accuracy": 0.36243700981140137, + "num_tokens": 1577806256.0, + "step": 3086 + }, + { + "epoch": 0.8347755543537047, + "grad_norm": 4.53125, + "learning_rate": 0.01912990117340459, + "loss": 3.7438, + "mean_token_accuracy": 0.349537193775177, + "num_tokens": 1578326218.0, + "step": 3087 + }, + { + "epoch": 0.8350459707950243, + "grad_norm": 3.328125, + "learning_rate": 0.01912922479411516, + "loss": 3.6145, + "mean_token_accuracy": 0.3557639718055725, + "num_tokens": 1578850485.0, + "step": 3088 + }, + { + "epoch": 0.8353163872363439, + "grad_norm": 3.90625, + "learning_rate": 0.01912854816539721, + "loss": 3.4751, + "mean_token_accuracy": 0.3660482168197632, + "num_tokens": 1579374646.0, + "step": 3089 + }, + { + "epoch": 0.8355868036776636, + "grad_norm": 3.09375, + "learning_rate": 0.019127871287271496, + "loss": 3.6657, + "mean_token_accuracy": 0.348450243473053, + "num_tokens": 1579898852.0, + "step": 3090 + }, + { + "epoch": 0.8358572201189832, + "grad_norm": 2.09375, + "learning_rate": 0.01912719415975879, + "loss": 9.9471, + "mean_token_accuracy": 0.007404782343655825, + "num_tokens": 1580381904.0, + "step": 3091 + }, + { + "epoch": 0.8361276365603029, + "grad_norm": 7.5, + "learning_rate": 0.019126516782879865, + "loss": 4.0941, + "mean_token_accuracy": 0.31087133288383484, + "num_tokens": 1580858910.0, + "step": 3092 + }, + { + "epoch": 0.8363980530016225, + "grad_norm": 2.3125, + "learning_rate": 0.019125839156655507, + "loss": 3.4122, + "mean_token_accuracy": 0.37143027782440186, + "num_tokens": 1581383031.0, + "step": 3093 + }, + { + "epoch": 0.8366684694429422, + "grad_norm": 3.859375, + "learning_rate": 0.019125161281106508, + "loss": 3.5816, + "mean_token_accuracy": 0.3622315526008606, + "num_tokens": 1581907305.0, + "step": 3094 + }, + { + "epoch": 0.8369388858842618, + "grad_norm": 3.328125, + "learning_rate": 0.019124483156253667, + "loss": 3.5139, + "mean_token_accuracy": 0.34909653663635254, + "num_tokens": 1582431535.0, + "step": 3095 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 3.390625, + "learning_rate": 0.01912380478211779, + "loss": 3.3826, + "mean_token_accuracy": 0.3957327604293823, + "num_tokens": 1582955743.0, + "step": 3096 + }, + { + "epoch": 0.8374797187669011, + "grad_norm": 3.703125, + "learning_rate": 0.01912312615871969, + "loss": 3.5702, + "mean_token_accuracy": 0.339625746011734, + "num_tokens": 1583479955.0, + "step": 3097 + }, + { + "epoch": 0.8377501352082206, + "grad_norm": 2.40625, + "learning_rate": 0.019122447286080188, + "loss": 3.4437, + "mean_token_accuracy": 0.386508584022522, + "num_tokens": 1583946009.0, + "step": 3098 + }, + { + "epoch": 0.8380205516495403, + "grad_norm": 3.546875, + "learning_rate": 0.01912176816422012, + "loss": 3.2794, + "mean_token_accuracy": 0.368452250957489, + "num_tokens": 1584470171.0, + "step": 3099 + }, + { + "epoch": 0.8382909680908599, + "grad_norm": 2.78125, + "learning_rate": 0.019121088793160324, + "loss": 3.2849, + "mean_token_accuracy": 0.37369439005851746, + "num_tokens": 1584978992.0, + "step": 3100 + }, + { + "epoch": 0.8385613845321795, + "grad_norm": 2.84375, + "learning_rate": 0.019120409172921637, + "loss": 3.4339, + "mean_token_accuracy": 0.3688649535179138, + "num_tokens": 1585503080.0, + "step": 3101 + }, + { + "epoch": 0.8388318009734992, + "grad_norm": 3.28125, + "learning_rate": 0.01911972930352492, + "loss": 3.3964, + "mean_token_accuracy": 0.36550214886665344, + "num_tokens": 1585929687.0, + "step": 3102 + }, + { + "epoch": 0.8391022174148188, + "grad_norm": 2.96875, + "learning_rate": 0.019119049184991025, + "loss": 3.4917, + "mean_token_accuracy": 0.36686402559280396, + "num_tokens": 1586453821.0, + "step": 3103 + }, + { + "epoch": 0.8393726338561385, + "grad_norm": 3.078125, + "learning_rate": 0.019118368817340827, + "loss": 3.3509, + "mean_token_accuracy": 0.34797102212905884, + "num_tokens": 1586978086.0, + "step": 3104 + }, + { + "epoch": 0.8396430502974581, + "grad_norm": 2.78125, + "learning_rate": 0.019117688200595194, + "loss": 3.1353, + "mean_token_accuracy": 0.3835158348083496, + "num_tokens": 1587502257.0, + "step": 3105 + }, + { + "epoch": 0.8399134667387778, + "grad_norm": 3.0, + "learning_rate": 0.01911700733477502, + "loss": 3.3339, + "mean_token_accuracy": 0.3656023144721985, + "num_tokens": 1588026514.0, + "step": 3106 + }, + { + "epoch": 0.8401838831800974, + "grad_norm": 2.6875, + "learning_rate": 0.019116326219901187, + "loss": 3.3269, + "mean_token_accuracy": 0.3785386085510254, + "num_tokens": 1588550703.0, + "step": 3107 + }, + { + "epoch": 0.8404542996214169, + "grad_norm": 3.3125, + "learning_rate": 0.0191156448559946, + "loss": 3.4245, + "mean_token_accuracy": 0.3734245002269745, + "num_tokens": 1589074878.0, + "step": 3108 + }, + { + "epoch": 0.8407247160627366, + "grad_norm": 3.0625, + "learning_rate": 0.019114963243076162, + "loss": 3.3939, + "mean_token_accuracy": 0.3662852644920349, + "num_tokens": 1589559314.0, + "step": 3109 + }, + { + "epoch": 0.8409951325040562, + "grad_norm": 2.921875, + "learning_rate": 0.019114281381166786, + "loss": 3.1089, + "mean_token_accuracy": 0.3955445885658264, + "num_tokens": 1590083534.0, + "step": 3110 + }, + { + "epoch": 0.8412655489453759, + "grad_norm": 30.0, + "learning_rate": 0.019113599270287395, + "loss": 12.8145, + "mean_token_accuracy": 0.014957190491259098, + "num_tokens": 1590607705.0, + "step": 3111 + }, + { + "epoch": 0.8415359653866955, + "grad_norm": 7.78125, + "learning_rate": 0.019112916910458917, + "loss": 4.0677, + "mean_token_accuracy": 0.32875049114227295, + "num_tokens": 1591087369.0, + "step": 3112 + }, + { + "epoch": 0.8418063818280151, + "grad_norm": 3.40625, + "learning_rate": 0.01911223430170229, + "loss": 3.504, + "mean_token_accuracy": 0.33602628111839294, + "num_tokens": 1591611613.0, + "step": 3113 + }, + { + "epoch": 0.8420767982693348, + "grad_norm": 2.96875, + "learning_rate": 0.019111551444038456, + "loss": 3.4228, + "mean_token_accuracy": 0.36777758598327637, + "num_tokens": 1592114730.0, + "step": 3114 + }, + { + "epoch": 0.8423472147106544, + "grad_norm": 2.734375, + "learning_rate": 0.019110868337488374, + "loss": 3.3248, + "mean_token_accuracy": 0.36386799812316895, + "num_tokens": 1592638962.0, + "step": 3115 + }, + { + "epoch": 0.8426176311519741, + "grad_norm": 3.09375, + "learning_rate": 0.019110184982072995, + "loss": 3.3807, + "mean_token_accuracy": 0.3582162857055664, + "num_tokens": 1593163133.0, + "step": 3116 + }, + { + "epoch": 0.8428880475932937, + "grad_norm": 2.078125, + "learning_rate": 0.019109501377813295, + "loss": 3.3563, + "mean_token_accuracy": 0.38427239656448364, + "num_tokens": 1593658382.0, + "step": 3117 + }, + { + "epoch": 0.8431584640346133, + "grad_norm": 3.265625, + "learning_rate": 0.01910881752473024, + "loss": 3.5514, + "mean_token_accuracy": 0.3493974208831787, + "num_tokens": 1594182497.0, + "step": 3118 + }, + { + "epoch": 0.8434288804759329, + "grad_norm": 4.78125, + "learning_rate": 0.019108133422844814, + "loss": 3.6938, + "mean_token_accuracy": 0.36399221420288086, + "num_tokens": 1594706779.0, + "step": 3119 + }, + { + "epoch": 0.8436992969172525, + "grad_norm": 4.09375, + "learning_rate": 0.01910744907217802, + "loss": 3.4919, + "mean_token_accuracy": 0.34698250889778137, + "num_tokens": 1595230998.0, + "step": 3120 + }, + { + "epoch": 0.8439697133585722, + "grad_norm": 2.484375, + "learning_rate": 0.019106764472750837, + "loss": 3.1008, + "mean_token_accuracy": 0.39792290329933167, + "num_tokens": 1595755130.0, + "step": 3121 + }, + { + "epoch": 0.8442401297998918, + "grad_norm": 3.40625, + "learning_rate": 0.01910607962458428, + "loss": 3.2944, + "mean_token_accuracy": 0.36595508456230164, + "num_tokens": 1596279353.0, + "step": 3122 + }, + { + "epoch": 0.8445105462412115, + "grad_norm": 2.1875, + "learning_rate": 0.019105394527699368, + "loss": 3.2976, + "mean_token_accuracy": 0.3840698301792145, + "num_tokens": 1596803331.0, + "step": 3123 + }, + { + "epoch": 0.8447809626825311, + "grad_norm": 3.34375, + "learning_rate": 0.019104709182117107, + "loss": 3.5463, + "mean_token_accuracy": 0.3203805088996887, + "num_tokens": 1597327573.0, + "step": 3124 + }, + { + "epoch": 0.8450513791238508, + "grad_norm": 2.59375, + "learning_rate": 0.01910402358785854, + "loss": 3.1701, + "mean_token_accuracy": 0.3850052058696747, + "num_tokens": 1597851849.0, + "step": 3125 + }, + { + "epoch": 0.8453217955651704, + "grad_norm": 3.828125, + "learning_rate": 0.019103337744944694, + "loss": 3.1963, + "mean_token_accuracy": 0.3782919645309448, + "num_tokens": 1598376119.0, + "step": 3126 + }, + { + "epoch": 0.84559221200649, + "grad_norm": 2.203125, + "learning_rate": 0.01910265165339662, + "loss": 3.1464, + "mean_token_accuracy": 0.3867056667804718, + "num_tokens": 1598900240.0, + "step": 3127 + }, + { + "epoch": 0.8458626284478096, + "grad_norm": 3.359375, + "learning_rate": 0.01910196531323536, + "loss": 3.5025, + "mean_token_accuracy": 0.3529936969280243, + "num_tokens": 1599424492.0, + "step": 3128 + }, + { + "epoch": 0.8461330448891292, + "grad_norm": 2.765625, + "learning_rate": 0.01910127872448198, + "loss": 3.5392, + "mean_token_accuracy": 0.367154598236084, + "num_tokens": 1599948593.0, + "step": 3129 + }, + { + "epoch": 0.8464034613304489, + "grad_norm": 4.0625, + "learning_rate": 0.019100591887157545, + "loss": 3.3506, + "mean_token_accuracy": 0.3703867495059967, + "num_tokens": 1600472845.0, + "step": 3130 + }, + { + "epoch": 0.8466738777717685, + "grad_norm": 424.0, + "learning_rate": 0.01909990480128313, + "loss": 17.577, + "mean_token_accuracy": 0.0034797966945916414, + "num_tokens": 1600997108.0, + "step": 3131 + }, + { + "epoch": 0.8469442942130881, + "grad_norm": 11.25, + "learning_rate": 0.01909921746687981, + "loss": 3.7004, + "mean_token_accuracy": 0.3318939805030823, + "num_tokens": 1601521346.0, + "step": 3132 + }, + { + "epoch": 0.8472147106544078, + "grad_norm": 4.28125, + "learning_rate": 0.019098529883968686, + "loss": 3.5599, + "mean_token_accuracy": 0.3910645842552185, + "num_tokens": 1601985140.0, + "step": 3133 + }, + { + "epoch": 0.8474851270957274, + "grad_norm": 2.859375, + "learning_rate": 0.019097842052570847, + "loss": 3.7733, + "mean_token_accuracy": 0.35115718841552734, + "num_tokens": 1602495773.0, + "step": 3134 + }, + { + "epoch": 0.8477555435370471, + "grad_norm": 3.6875, + "learning_rate": 0.019097153972707396, + "loss": 3.5764, + "mean_token_accuracy": 0.3482271134853363, + "num_tokens": 1603008651.0, + "step": 3135 + }, + { + "epoch": 0.8480259599783667, + "grad_norm": 2.5625, + "learning_rate": 0.019096465644399454, + "loss": 3.5872, + "mean_token_accuracy": 0.3629436492919922, + "num_tokens": 1603501274.0, + "step": 3136 + }, + { + "epoch": 0.8482963764196864, + "grad_norm": 2.8125, + "learning_rate": 0.019095777067668133, + "loss": 3.534, + "mean_token_accuracy": 0.35117754340171814, + "num_tokens": 1604025501.0, + "step": 3137 + }, + { + "epoch": 0.848566792861006, + "grad_norm": 3.203125, + "learning_rate": 0.01909508824253456, + "loss": 3.5909, + "mean_token_accuracy": 0.3546387553215027, + "num_tokens": 1604549703.0, + "step": 3138 + }, + { + "epoch": 0.8488372093023255, + "grad_norm": 3.703125, + "learning_rate": 0.019094399169019883, + "loss": 3.4302, + "mean_token_accuracy": 0.3519982099533081, + "num_tokens": 1605073851.0, + "step": 3139 + }, + { + "epoch": 0.8491076257436452, + "grad_norm": 2.765625, + "learning_rate": 0.01909370984714523, + "loss": 3.4265, + "mean_token_accuracy": 0.3875112235546112, + "num_tokens": 1605567844.0, + "step": 3140 + }, + { + "epoch": 0.8493780421849648, + "grad_norm": 3.15625, + "learning_rate": 0.019093020276931758, + "loss": 3.2987, + "mean_token_accuracy": 0.39286166429519653, + "num_tokens": 1606031663.0, + "step": 3141 + }, + { + "epoch": 0.8496484586262845, + "grad_norm": 2.46875, + "learning_rate": 0.01909233045840062, + "loss": 3.2763, + "mean_token_accuracy": 0.3853089213371277, + "num_tokens": 1606530531.0, + "step": 3142 + }, + { + "epoch": 0.8499188750676041, + "grad_norm": 2.671875, + "learning_rate": 0.019091640391572986, + "loss": 3.4856, + "mean_token_accuracy": 0.3958980143070221, + "num_tokens": 1606916016.0, + "step": 3143 + }, + { + "epoch": 0.8501892915089238, + "grad_norm": 4.75, + "learning_rate": 0.019090950076470034, + "loss": 2.9621, + "mean_token_accuracy": 0.4431612491607666, + "num_tokens": 1607440159.0, + "step": 3144 + }, + { + "epoch": 0.8504597079502434, + "grad_norm": 2.5625, + "learning_rate": 0.019090259513112936, + "loss": 3.4322, + "mean_token_accuracy": 0.35623300075531006, + "num_tokens": 1607964369.0, + "step": 3145 + }, + { + "epoch": 0.850730124391563, + "grad_norm": 3.953125, + "learning_rate": 0.019089568701522886, + "loss": 3.4397, + "mean_token_accuracy": 0.3590511381626129, + "num_tokens": 1608488638.0, + "step": 3146 + }, + { + "epoch": 0.8510005408328827, + "grad_norm": 2.5625, + "learning_rate": 0.019088877641721076, + "loss": 3.4141, + "mean_token_accuracy": 0.37950778007507324, + "num_tokens": 1609012849.0, + "step": 3147 + }, + { + "epoch": 0.8512709572742023, + "grad_norm": 3.03125, + "learning_rate": 0.019088186333728715, + "loss": 3.2502, + "mean_token_accuracy": 0.3883429169654846, + "num_tokens": 1609505177.0, + "step": 3148 + }, + { + "epoch": 0.8515413737155219, + "grad_norm": 2.578125, + "learning_rate": 0.01908749477756701, + "loss": 3.2996, + "mean_token_accuracy": 0.38612252473831177, + "num_tokens": 1610029397.0, + "step": 3149 + }, + { + "epoch": 0.8518117901568415, + "grad_norm": 3.6875, + "learning_rate": 0.019086802973257183, + "loss": 3.3387, + "mean_token_accuracy": 0.3582899570465088, + "num_tokens": 1610553598.0, + "step": 3150 + }, + { + "epoch": 0.8520822065981611, + "grad_norm": 288.0, + "learning_rate": 0.019086110920820458, + "loss": 27.3742, + "mean_token_accuracy": 4.509559221332893e-05, + "num_tokens": 1611077864.0, + "step": 3151 + }, + { + "epoch": 0.8523526230394808, + "grad_norm": 6.59375, + "learning_rate": 0.01908541862027807, + "loss": 3.9205, + "mean_token_accuracy": 0.3218485116958618, + "num_tokens": 1611602005.0, + "step": 3152 + }, + { + "epoch": 0.8526230394808004, + "grad_norm": 2.390625, + "learning_rate": 0.019084726071651258, + "loss": 3.5836, + "mean_token_accuracy": 0.3786291480064392, + "num_tokens": 1612055205.0, + "step": 3153 + }, + { + "epoch": 0.8528934559221201, + "grad_norm": 3.203125, + "learning_rate": 0.019084033274961276, + "loss": 3.5894, + "mean_token_accuracy": 0.32837924361228943, + "num_tokens": 1612579421.0, + "step": 3154 + }, + { + "epoch": 0.8531638723634397, + "grad_norm": 2.8125, + "learning_rate": 0.01908334023022938, + "loss": 3.6185, + "mean_token_accuracy": 0.3686201572418213, + "num_tokens": 1613103699.0, + "step": 3155 + }, + { + "epoch": 0.8534342888047594, + "grad_norm": 3.21875, + "learning_rate": 0.01908264693747684, + "loss": 3.6272, + "mean_token_accuracy": 0.3401775360107422, + "num_tokens": 1613627755.0, + "step": 3156 + }, + { + "epoch": 0.853704705246079, + "grad_norm": 2.828125, + "learning_rate": 0.019081953396724912, + "loss": 3.2985, + "mean_token_accuracy": 0.3752036690711975, + "num_tokens": 1614151962.0, + "step": 3157 + }, + { + "epoch": 0.8539751216873986, + "grad_norm": 11.25, + "learning_rate": 0.019081259607994897, + "loss": 3.4172, + "mean_token_accuracy": 0.38942989706993103, + "num_tokens": 1614676196.0, + "step": 3158 + }, + { + "epoch": 0.8542455381287182, + "grad_norm": 2.390625, + "learning_rate": 0.019080565571308064, + "loss": 3.3418, + "mean_token_accuracy": 0.37223413586616516, + "num_tokens": 1615200374.0, + "step": 3159 + }, + { + "epoch": 0.8545159545700378, + "grad_norm": 2.265625, + "learning_rate": 0.01907987128668572, + "loss": 3.0213, + "mean_token_accuracy": 0.3878306448459625, + "num_tokens": 1615724599.0, + "step": 3160 + }, + { + "epoch": 0.8547863710113575, + "grad_norm": 3.640625, + "learning_rate": 0.01907917675414916, + "loss": 3.2996, + "mean_token_accuracy": 0.36564528942108154, + "num_tokens": 1616248798.0, + "step": 3161 + }, + { + "epoch": 0.8550567874526771, + "grad_norm": 3.59375, + "learning_rate": 0.0190784819737197, + "loss": 3.4533, + "mean_token_accuracy": 0.3434180021286011, + "num_tokens": 1616773062.0, + "step": 3162 + }, + { + "epoch": 0.8553272038939967, + "grad_norm": 2.421875, + "learning_rate": 0.01907778694541866, + "loss": 3.1889, + "mean_token_accuracy": 0.38083815574645996, + "num_tokens": 1617297300.0, + "step": 3163 + }, + { + "epoch": 0.8555976203353164, + "grad_norm": 3.203125, + "learning_rate": 0.019077091669267354, + "loss": 3.1746, + "mean_token_accuracy": 0.36545199155807495, + "num_tokens": 1617821463.0, + "step": 3164 + }, + { + "epoch": 0.855868036776636, + "grad_norm": 2.484375, + "learning_rate": 0.019076396145287125, + "loss": 3.2152, + "mean_token_accuracy": 0.3858686685562134, + "num_tokens": 1618345493.0, + "step": 3165 + }, + { + "epoch": 0.8561384532179557, + "grad_norm": 3.171875, + "learning_rate": 0.019075700373499312, + "loss": 3.597, + "mean_token_accuracy": 0.3592401444911957, + "num_tokens": 1618869729.0, + "step": 3166 + }, + { + "epoch": 0.8564088696592753, + "grad_norm": 2.96875, + "learning_rate": 0.019075004353925264, + "loss": 3.3563, + "mean_token_accuracy": 0.3804820775985718, + "num_tokens": 1619393698.0, + "step": 3167 + }, + { + "epoch": 0.856679286100595, + "grad_norm": 3.3125, + "learning_rate": 0.01907430808658633, + "loss": 3.3565, + "mean_token_accuracy": 0.3823234736919403, + "num_tokens": 1619917801.0, + "step": 3168 + }, + { + "epoch": 0.8569497025419145, + "grad_norm": 3.6875, + "learning_rate": 0.019073611571503883, + "loss": 3.4883, + "mean_token_accuracy": 0.37102147936820984, + "num_tokens": 1620442085.0, + "step": 3169 + }, + { + "epoch": 0.8572201189832341, + "grad_norm": 3.46875, + "learning_rate": 0.019072914808699288, + "loss": 3.5483, + "mean_token_accuracy": 0.37580713629722595, + "num_tokens": 1620966224.0, + "step": 3170 + }, + { + "epoch": 0.8574905354245538, + "grad_norm": 99.0, + "learning_rate": 0.019072217798193927, + "loss": 13.4255, + "mean_token_accuracy": 0.0, + "num_tokens": 1621490309.0, + "step": 3171 + }, + { + "epoch": 0.8577609518658734, + "grad_norm": 8.4375, + "learning_rate": 0.019071520540009185, + "loss": 3.831, + "mean_token_accuracy": 0.3229196071624756, + "num_tokens": 1621977026.0, + "step": 3172 + }, + { + "epoch": 0.8580313683071931, + "grad_norm": 2.703125, + "learning_rate": 0.01907082303416646, + "loss": 3.4366, + "mean_token_accuracy": 0.352138876914978, + "num_tokens": 1622501226.0, + "step": 3173 + }, + { + "epoch": 0.8583017847485127, + "grad_norm": 3.21875, + "learning_rate": 0.019070125280687144, + "loss": 3.721, + "mean_token_accuracy": 0.34987515211105347, + "num_tokens": 1622979223.0, + "step": 3174 + }, + { + "epoch": 0.8585722011898324, + "grad_norm": 2.59375, + "learning_rate": 0.01906942727959265, + "loss": 3.4513, + "mean_token_accuracy": 0.3659553527832031, + "num_tokens": 1623503468.0, + "step": 3175 + }, + { + "epoch": 0.858842617631152, + "grad_norm": 2.625, + "learning_rate": 0.019068729030904404, + "loss": 3.4018, + "mean_token_accuracy": 0.37282299995422363, + "num_tokens": 1624027712.0, + "step": 3176 + }, + { + "epoch": 0.8591130340724716, + "grad_norm": 2.875, + "learning_rate": 0.01906803053464382, + "loss": 3.5723, + "mean_token_accuracy": 0.361532598733902, + "num_tokens": 1624546227.0, + "step": 3177 + }, + { + "epoch": 0.8593834505137913, + "grad_norm": 3.125, + "learning_rate": 0.01906733179083233, + "loss": 3.4604, + "mean_token_accuracy": 0.38097721338272095, + "num_tokens": 1625057030.0, + "step": 3178 + }, + { + "epoch": 0.8596538669551109, + "grad_norm": 3.0625, + "learning_rate": 0.019066632799491376, + "loss": 3.4646, + "mean_token_accuracy": 0.37487146258354187, + "num_tokens": 1625581156.0, + "step": 3179 + }, + { + "epoch": 0.8599242833964305, + "grad_norm": 2.828125, + "learning_rate": 0.01906593356064241, + "loss": 3.3648, + "mean_token_accuracy": 0.36605778336524963, + "num_tokens": 1626105420.0, + "step": 3180 + }, + { + "epoch": 0.8601946998377501, + "grad_norm": 3.375, + "learning_rate": 0.019065234074306877, + "loss": 3.3319, + "mean_token_accuracy": 0.39763325452804565, + "num_tokens": 1626627803.0, + "step": 3181 + }, + { + "epoch": 0.8604651162790697, + "grad_norm": 3.609375, + "learning_rate": 0.019064534340506246, + "loss": 3.4319, + "mean_token_accuracy": 0.341166615486145, + "num_tokens": 1627151995.0, + "step": 3182 + }, + { + "epoch": 0.8607355327203894, + "grad_norm": 3.15625, + "learning_rate": 0.019063834359261986, + "loss": 3.2715, + "mean_token_accuracy": 0.373296856880188, + "num_tokens": 1627676201.0, + "step": 3183 + }, + { + "epoch": 0.861005949161709, + "grad_norm": 4.25, + "learning_rate": 0.01906313413059557, + "loss": 3.244, + "mean_token_accuracy": 0.37805449962615967, + "num_tokens": 1628200416.0, + "step": 3184 + }, + { + "epoch": 0.8612763656030287, + "grad_norm": 1.953125, + "learning_rate": 0.01906243365452849, + "loss": 3.2219, + "mean_token_accuracy": 0.38379305601119995, + "num_tokens": 1628679868.0, + "step": 3185 + }, + { + "epoch": 0.8615467820443483, + "grad_norm": 2.953125, + "learning_rate": 0.019061732931082237, + "loss": 3.3148, + "mean_token_accuracy": 0.38031238317489624, + "num_tokens": 1629204139.0, + "step": 3186 + }, + { + "epoch": 0.861817198485668, + "grad_norm": 3.0, + "learning_rate": 0.019061031960278307, + "loss": 3.2851, + "mean_token_accuracy": 0.3738784193992615, + "num_tokens": 1629629630.0, + "step": 3187 + }, + { + "epoch": 0.8620876149269876, + "grad_norm": 2.953125, + "learning_rate": 0.01906033074213821, + "loss": 3.2838, + "mean_token_accuracy": 0.3795976936817169, + "num_tokens": 1630153844.0, + "step": 3188 + }, + { + "epoch": 0.8623580313683072, + "grad_norm": 2.75, + "learning_rate": 0.019059629276683462, + "loss": 3.1008, + "mean_token_accuracy": 0.4013441205024719, + "num_tokens": 1630669262.0, + "step": 3189 + }, + { + "epoch": 0.8626284478096268, + "grad_norm": 2.8125, + "learning_rate": 0.019058927563935592, + "loss": 3.3694, + "mean_token_accuracy": 0.3869315981864929, + "num_tokens": 1631193453.0, + "step": 3190 + }, + { + "epoch": 0.8628988642509464, + "grad_norm": 78.0, + "learning_rate": 0.019058225603916117, + "loss": 11.3557, + "mean_token_accuracy": 0.0306050144135952, + "num_tokens": 1631717465.0, + "step": 3191 + }, + { + "epoch": 0.8631692806922661, + "grad_norm": 6.3125, + "learning_rate": 0.019057523396646587, + "loss": 4.0329, + "mean_token_accuracy": 0.26907211542129517, + "num_tokens": 1632241626.0, + "step": 3192 + }, + { + "epoch": 0.8634396971335857, + "grad_norm": 2.5625, + "learning_rate": 0.019056820942148542, + "loss": 3.4731, + "mean_token_accuracy": 0.32724589109420776, + "num_tokens": 1632765590.0, + "step": 3193 + }, + { + "epoch": 0.8637101135749053, + "grad_norm": 2.140625, + "learning_rate": 0.019056118240443537, + "loss": 3.2462, + "mean_token_accuracy": 0.3830568790435791, + "num_tokens": 1633289868.0, + "step": 3194 + }, + { + "epoch": 0.863980530016225, + "grad_norm": 3.5, + "learning_rate": 0.01905541529155313, + "loss": 3.5947, + "mean_token_accuracy": 0.3624061942100525, + "num_tokens": 1633814102.0, + "step": 3195 + }, + { + "epoch": 0.8642509464575446, + "grad_norm": 4.03125, + "learning_rate": 0.019054712095498895, + "loss": 3.4054, + "mean_token_accuracy": 0.3622574210166931, + "num_tokens": 1634338355.0, + "step": 3196 + }, + { + "epoch": 0.8645213628988643, + "grad_norm": 3.21875, + "learning_rate": 0.019054008652302407, + "loss": 3.4661, + "mean_token_accuracy": 0.3660370111465454, + "num_tokens": 1634862603.0, + "step": 3197 + }, + { + "epoch": 0.8647917793401839, + "grad_norm": 3.546875, + "learning_rate": 0.01905330496198525, + "loss": 3.4957, + "mean_token_accuracy": 0.3615279793739319, + "num_tokens": 1635351596.0, + "step": 3198 + }, + { + "epoch": 0.8650621957815036, + "grad_norm": 2.984375, + "learning_rate": 0.01905260102456901, + "loss": 3.3733, + "mean_token_accuracy": 0.3793931007385254, + "num_tokens": 1635833883.0, + "step": 3199 + }, + { + "epoch": 0.8653326122228231, + "grad_norm": 2.59375, + "learning_rate": 0.019051896840075293, + "loss": 3.3407, + "mean_token_accuracy": 0.38554590940475464, + "num_tokens": 1636292422.0, + "step": 3200 + }, + { + "epoch": 0.8656030286641427, + "grad_norm": 3.828125, + "learning_rate": 0.019051192408525694, + "loss": 3.6028, + "mean_token_accuracy": 0.34053653478622437, + "num_tokens": 1636816628.0, + "step": 3201 + }, + { + "epoch": 0.8658734451054624, + "grad_norm": 2.9375, + "learning_rate": 0.01905048772994184, + "loss": 3.4082, + "mean_token_accuracy": 0.3867189884185791, + "num_tokens": 1637268156.0, + "step": 3202 + }, + { + "epoch": 0.866143861546782, + "grad_norm": 3.078125, + "learning_rate": 0.01904978280434535, + "loss": 3.6206, + "mean_token_accuracy": 0.33830726146698, + "num_tokens": 1637792422.0, + "step": 3203 + }, + { + "epoch": 0.8664142779881017, + "grad_norm": 2.484375, + "learning_rate": 0.01904907763175785, + "loss": 3.4264, + "mean_token_accuracy": 0.3624218702316284, + "num_tokens": 1638316707.0, + "step": 3204 + }, + { + "epoch": 0.8666846944294213, + "grad_norm": 2.6875, + "learning_rate": 0.01904837221220097, + "loss": 3.269, + "mean_token_accuracy": 0.37784507870674133, + "num_tokens": 1638783226.0, + "step": 3205 + }, + { + "epoch": 0.866955110870741, + "grad_norm": 2.171875, + "learning_rate": 0.019047666545696368, + "loss": 3.1494, + "mean_token_accuracy": 0.3678613007068634, + "num_tokens": 1639307280.0, + "step": 3206 + }, + { + "epoch": 0.8672255273120606, + "grad_norm": 10.375, + "learning_rate": 0.01904696063226569, + "loss": 3.2296, + "mean_token_accuracy": 0.406503826379776, + "num_tokens": 1639738280.0, + "step": 3207 + }, + { + "epoch": 0.8674959437533802, + "grad_norm": 2.046875, + "learning_rate": 0.019046254471930594, + "loss": 3.3484, + "mean_token_accuracy": 0.37016698718070984, + "num_tokens": 1640262329.0, + "step": 3208 + }, + { + "epoch": 0.8677663601946999, + "grad_norm": 3.125, + "learning_rate": 0.019045548064712748, + "loss": 3.2388, + "mean_token_accuracy": 0.3913159966468811, + "num_tokens": 1640786495.0, + "step": 3209 + }, + { + "epoch": 0.8680367766360195, + "grad_norm": 3.34375, + "learning_rate": 0.01904484141063383, + "loss": 3.2931, + "mean_token_accuracy": 0.3797507882118225, + "num_tokens": 1641251773.0, + "step": 3210 + }, + { + "epoch": 0.8683071930773391, + "grad_norm": 113.5, + "learning_rate": 0.019044134509715514, + "loss": 19.6608, + "mean_token_accuracy": 4.131029709242284e-05, + "num_tokens": 1641775942.0, + "step": 3211 + }, + { + "epoch": 0.8685776095186587, + "grad_norm": 8.0625, + "learning_rate": 0.0190434273619795, + "loss": 3.9888, + "mean_token_accuracy": 0.33117854595184326, + "num_tokens": 1642300165.0, + "step": 3212 + }, + { + "epoch": 0.8688480259599783, + "grad_norm": 2.3125, + "learning_rate": 0.019042719967447476, + "loss": 3.3198, + "mean_token_accuracy": 0.3588266968727112, + "num_tokens": 1642824372.0, + "step": 3213 + }, + { + "epoch": 0.869118442401298, + "grad_norm": 2.5, + "learning_rate": 0.019042012326141156, + "loss": 3.4868, + "mean_token_accuracy": 0.35871008038520813, + "num_tokens": 1643347673.0, + "step": 3214 + }, + { + "epoch": 0.8693888588426176, + "grad_norm": 3.9375, + "learning_rate": 0.019041304438082246, + "loss": 3.4049, + "mean_token_accuracy": 0.3827663064002991, + "num_tokens": 1643860391.0, + "step": 3215 + }, + { + "epoch": 0.8696592752839373, + "grad_norm": 3.328125, + "learning_rate": 0.019040596303292464, + "loss": 3.2784, + "mean_token_accuracy": 0.349422425031662, + "num_tokens": 1644384516.0, + "step": 3216 + }, + { + "epoch": 0.8699296917252569, + "grad_norm": 3.59375, + "learning_rate": 0.019039887921793543, + "loss": 3.2169, + "mean_token_accuracy": 0.35917437076568604, + "num_tokens": 1644908688.0, + "step": 3217 + }, + { + "epoch": 0.8702001081665766, + "grad_norm": 3.1875, + "learning_rate": 0.01903917929360722, + "loss": 3.3587, + "mean_token_accuracy": 0.3674595355987549, + "num_tokens": 1645432845.0, + "step": 3218 + }, + { + "epoch": 0.8704705246078962, + "grad_norm": 3.53125, + "learning_rate": 0.019038470418755234, + "loss": 3.2263, + "mean_token_accuracy": 0.37936824560165405, + "num_tokens": 1645957069.0, + "step": 3219 + }, + { + "epoch": 0.8707409410492158, + "grad_norm": 3.421875, + "learning_rate": 0.019037761297259332, + "loss": 3.2041, + "mean_token_accuracy": 0.3899479806423187, + "num_tokens": 1646474570.0, + "step": 3220 + }, + { + "epoch": 0.8710113574905354, + "grad_norm": 2.046875, + "learning_rate": 0.019037051929141284, + "loss": 3.2845, + "mean_token_accuracy": 0.4015046954154968, + "num_tokens": 1646930689.0, + "step": 3221 + }, + { + "epoch": 0.871281773931855, + "grad_norm": 3.59375, + "learning_rate": 0.01903634231442284, + "loss": 3.5734, + "mean_token_accuracy": 0.3514028489589691, + "num_tokens": 1647454766.0, + "step": 3222 + }, + { + "epoch": 0.8715521903731747, + "grad_norm": 2.734375, + "learning_rate": 0.01903563245312578, + "loss": 3.2617, + "mean_token_accuracy": 0.385519802570343, + "num_tokens": 1647978966.0, + "step": 3223 + }, + { + "epoch": 0.8718226068144943, + "grad_norm": 3.59375, + "learning_rate": 0.01903492234527189, + "loss": 3.3613, + "mean_token_accuracy": 0.37894192337989807, + "num_tokens": 1648503173.0, + "step": 3224 + }, + { + "epoch": 0.872093023255814, + "grad_norm": 2.75, + "learning_rate": 0.01903421199088295, + "loss": 3.4595, + "mean_token_accuracy": 0.3525165021419525, + "num_tokens": 1649027428.0, + "step": 3225 + }, + { + "epoch": 0.8723634396971336, + "grad_norm": 3.609375, + "learning_rate": 0.019033501389980764, + "loss": 3.4569, + "mean_token_accuracy": 0.39446914196014404, + "num_tokens": 1649551574.0, + "step": 3226 + }, + { + "epoch": 0.8726338561384532, + "grad_norm": 2.609375, + "learning_rate": 0.019032790542587122, + "loss": 3.3631, + "mean_token_accuracy": 0.35982614755630493, + "num_tokens": 1650075771.0, + "step": 3227 + }, + { + "epoch": 0.8729042725797729, + "grad_norm": 2.765625, + "learning_rate": 0.01903207944872385, + "loss": 3.2409, + "mean_token_accuracy": 0.3904421329498291, + "num_tokens": 1650599954.0, + "step": 3228 + }, + { + "epoch": 0.8731746890210925, + "grad_norm": 2.484375, + "learning_rate": 0.019031368108412754, + "loss": 3.1888, + "mean_token_accuracy": 0.37361615896224976, + "num_tokens": 1651124074.0, + "step": 3229 + }, + { + "epoch": 0.8734451054624122, + "grad_norm": 2.78125, + "learning_rate": 0.019030656521675675, + "loss": 3.3604, + "mean_token_accuracy": 0.37980222702026367, + "num_tokens": 1651623273.0, + "step": 3230 + }, + { + "epoch": 0.8737155219037317, + "grad_norm": 123.5, + "learning_rate": 0.019029944688534427, + "loss": 12.2169, + "mean_token_accuracy": 1.4343087059387472e-05, + "num_tokens": 1652147448.0, + "step": 3231 + }, + { + "epoch": 0.8739859383450513, + "grad_norm": 6.96875, + "learning_rate": 0.019029232609010866, + "loss": 4.1572, + "mean_token_accuracy": 0.29424503445625305, + "num_tokens": 1652671608.0, + "step": 3232 + }, + { + "epoch": 0.874256354786371, + "grad_norm": 2.84375, + "learning_rate": 0.019028520283126838, + "loss": 3.5569, + "mean_token_accuracy": 0.3450602889060974, + "num_tokens": 1653171493.0, + "step": 3233 + }, + { + "epoch": 0.8745267712276906, + "grad_norm": 2.984375, + "learning_rate": 0.019027807710904194, + "loss": 3.4618, + "mean_token_accuracy": 0.36516672372817993, + "num_tokens": 1653695657.0, + "step": 3234 + }, + { + "epoch": 0.8747971876690103, + "grad_norm": 2.875, + "learning_rate": 0.0190270948923648, + "loss": 3.47, + "mean_token_accuracy": 0.3666486442089081, + "num_tokens": 1654204048.0, + "step": 3235 + }, + { + "epoch": 0.8750676041103299, + "grad_norm": 3.640625, + "learning_rate": 0.019026381827530536, + "loss": 3.4184, + "mean_token_accuracy": 0.3670092821121216, + "num_tokens": 1654728102.0, + "step": 3236 + }, + { + "epoch": 0.8753380205516496, + "grad_norm": 3.390625, + "learning_rate": 0.019025668516423268, + "loss": 3.3755, + "mean_token_accuracy": 0.3846244513988495, + "num_tokens": 1655252377.0, + "step": 3237 + }, + { + "epoch": 0.8756084369929692, + "grad_norm": 3.28125, + "learning_rate": 0.019024954959064887, + "loss": 3.5428, + "mean_token_accuracy": 0.35425496101379395, + "num_tokens": 1655776570.0, + "step": 3238 + }, + { + "epoch": 0.8758788534342888, + "grad_norm": 2.953125, + "learning_rate": 0.019024241155477288, + "loss": 3.4062, + "mean_token_accuracy": 0.3662149906158447, + "num_tokens": 1656300826.0, + "step": 3239 + }, + { + "epoch": 0.8761492698756085, + "grad_norm": 2.875, + "learning_rate": 0.019023527105682374, + "loss": 3.4168, + "mean_token_accuracy": 0.36128631234169006, + "num_tokens": 1656825006.0, + "step": 3240 + }, + { + "epoch": 0.876419686316928, + "grad_norm": 3.625, + "learning_rate": 0.019022812809702052, + "loss": 3.5186, + "mean_token_accuracy": 0.3690946698188782, + "num_tokens": 1657311913.0, + "step": 3241 + }, + { + "epoch": 0.8766901027582477, + "grad_norm": 2.796875, + "learning_rate": 0.019022098267558244, + "loss": 3.2708, + "mean_token_accuracy": 0.38164395093917847, + "num_tokens": 1657836141.0, + "step": 3242 + }, + { + "epoch": 0.8769605191995673, + "grad_norm": 2.9375, + "learning_rate": 0.019021383479272864, + "loss": 3.2279, + "mean_token_accuracy": 0.37698468565940857, + "num_tokens": 1658360359.0, + "step": 3243 + }, + { + "epoch": 0.8772309356408869, + "grad_norm": 2.78125, + "learning_rate": 0.019020668444867852, + "loss": 3.5785, + "mean_token_accuracy": 0.35876673460006714, + "num_tokens": 1658884634.0, + "step": 3244 + }, + { + "epoch": 0.8775013520822066, + "grad_norm": 3.921875, + "learning_rate": 0.019019953164365146, + "loss": 3.4715, + "mean_token_accuracy": 0.3596104383468628, + "num_tokens": 1659376133.0, + "step": 3245 + }, + { + "epoch": 0.8777717685235262, + "grad_norm": 2.84375, + "learning_rate": 0.01901923763778669, + "loss": 3.4396, + "mean_token_accuracy": 0.3790147304534912, + "num_tokens": 1659894840.0, + "step": 3246 + }, + { + "epoch": 0.8780421849648459, + "grad_norm": 2.609375, + "learning_rate": 0.019018521865154438, + "loss": 3.4594, + "mean_token_accuracy": 0.40650442242622375, + "num_tokens": 1660364320.0, + "step": 3247 + }, + { + "epoch": 0.8783126014061655, + "grad_norm": 5.84375, + "learning_rate": 0.019017805846490356, + "loss": 3.1003, + "mean_token_accuracy": 0.41264116764068604, + "num_tokens": 1660874089.0, + "step": 3248 + }, + { + "epoch": 0.8785830178474852, + "grad_norm": 1.7890625, + "learning_rate": 0.019017089581816412, + "loss": 3.4555, + "mean_token_accuracy": 0.3663625717163086, + "num_tokens": 1661398237.0, + "step": 3249 + }, + { + "epoch": 0.8788534342888048, + "grad_norm": 3.609375, + "learning_rate": 0.019016373071154585, + "loss": 3.5328, + "mean_token_accuracy": 0.3581573963165283, + "num_tokens": 1661922498.0, + "step": 3250 + }, + { + "epoch": 0.8791238507301244, + "grad_norm": 32.5, + "learning_rate": 0.019015656314526858, + "loss": 12.4506, + "mean_token_accuracy": 2.6049806365335826e-06, + "num_tokens": 1662416055.0, + "step": 3251 + }, + { + "epoch": 0.879394267171444, + "grad_norm": 7.125, + "learning_rate": 0.01901493931195522, + "loss": 4.0363, + "mean_token_accuracy": 0.3403441309928894, + "num_tokens": 1662940322.0, + "step": 3252 + }, + { + "epoch": 0.8796646836127636, + "grad_norm": 2.484375, + "learning_rate": 0.019014222063461674, + "loss": 3.4386, + "mean_token_accuracy": 0.3741183280944824, + "num_tokens": 1663430053.0, + "step": 3253 + }, + { + "epoch": 0.8799351000540833, + "grad_norm": 3.703125, + "learning_rate": 0.019013504569068224, + "loss": 3.3276, + "mean_token_accuracy": 0.35632413625717163, + "num_tokens": 1663954218.0, + "step": 3254 + }, + { + "epoch": 0.8802055164954029, + "grad_norm": 3.453125, + "learning_rate": 0.019012786828796892, + "loss": 3.5339, + "mean_token_accuracy": 0.34463581442832947, + "num_tokens": 1664475497.0, + "step": 3255 + }, + { + "epoch": 0.8804759329367225, + "grad_norm": 3.28125, + "learning_rate": 0.019012068842669698, + "loss": 3.6399, + "mean_token_accuracy": 0.3720252513885498, + "num_tokens": 1664960729.0, + "step": 3256 + }, + { + "epoch": 0.8807463493780422, + "grad_norm": 2.859375, + "learning_rate": 0.01901135061070866, + "loss": 3.4619, + "mean_token_accuracy": 0.37243586778640747, + "num_tokens": 1665484925.0, + "step": 3257 + }, + { + "epoch": 0.8810167658193618, + "grad_norm": 3.640625, + "learning_rate": 0.019010632132935835, + "loss": 3.5699, + "mean_token_accuracy": 0.3541313409805298, + "num_tokens": 1666009097.0, + "step": 3258 + }, + { + "epoch": 0.8812871822606815, + "grad_norm": 3.8125, + "learning_rate": 0.019009913409373257, + "loss": 3.575, + "mean_token_accuracy": 0.3611234724521637, + "num_tokens": 1666533335.0, + "step": 3259 + }, + { + "epoch": 0.8815575987020011, + "grad_norm": 18.125, + "learning_rate": 0.019009194440042978, + "loss": 3.3911, + "mean_token_accuracy": 0.35197317600250244, + "num_tokens": 1667057614.0, + "step": 3260 + }, + { + "epoch": 0.8818280151433208, + "grad_norm": 3.140625, + "learning_rate": 0.01900847522496706, + "loss": 3.4395, + "mean_token_accuracy": 0.3443101644515991, + "num_tokens": 1667581822.0, + "step": 3261 + }, + { + "epoch": 0.8820984315846403, + "grad_norm": 1.890625, + "learning_rate": 0.01900775576416757, + "loss": 3.2568, + "mean_token_accuracy": 0.3919598460197449, + "num_tokens": 1668054447.0, + "step": 3262 + }, + { + "epoch": 0.8823688480259599, + "grad_norm": 2.9375, + "learning_rate": 0.019007036057666588, + "loss": 3.6149, + "mean_token_accuracy": 0.3679783344268799, + "num_tokens": 1668568646.0, + "step": 3263 + }, + { + "epoch": 0.8826392644672796, + "grad_norm": 3.125, + "learning_rate": 0.019006316105486185, + "loss": 3.6023, + "mean_token_accuracy": 0.3674732446670532, + "num_tokens": 1669092923.0, + "step": 3264 + }, + { + "epoch": 0.8829096809085992, + "grad_norm": 2.8125, + "learning_rate": 0.019005595907648467, + "loss": 3.5072, + "mean_token_accuracy": 0.3568364977836609, + "num_tokens": 1669617142.0, + "step": 3265 + }, + { + "epoch": 0.8831800973499189, + "grad_norm": 3.09375, + "learning_rate": 0.01900487546417552, + "loss": 3.6092, + "mean_token_accuracy": 0.372692346572876, + "num_tokens": 1670101798.0, + "step": 3266 + }, + { + "epoch": 0.8834505137912385, + "grad_norm": 2.96875, + "learning_rate": 0.01900415477508945, + "loss": 3.4336, + "mean_token_accuracy": 0.4150204062461853, + "num_tokens": 1670520193.0, + "step": 3267 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 2.984375, + "learning_rate": 0.01900343384041238, + "loss": 3.5523, + "mean_token_accuracy": 0.35524803400039673, + "num_tokens": 1671044455.0, + "step": 3268 + }, + { + "epoch": 0.8839913466738778, + "grad_norm": 2.9375, + "learning_rate": 0.019002712660166416, + "loss": 3.5577, + "mean_token_accuracy": 0.3707793354988098, + "num_tokens": 1671568730.0, + "step": 3269 + }, + { + "epoch": 0.8842617631151974, + "grad_norm": 2.828125, + "learning_rate": 0.019001991234373697, + "loss": 3.2152, + "mean_token_accuracy": 0.37648850679397583, + "num_tokens": 1672092900.0, + "step": 3270 + }, + { + "epoch": 0.8845321795565171, + "grad_norm": 10.5625, + "learning_rate": 0.019001269563056356, + "loss": 10.2705, + "mean_token_accuracy": 5.069451617600862e-06, + "num_tokens": 1672617079.0, + "step": 3271 + }, + { + "epoch": 0.8848025959978366, + "grad_norm": 5.90625, + "learning_rate": 0.019000547646236533, + "loss": 3.8258, + "mean_token_accuracy": 0.3413878083229065, + "num_tokens": 1673141350.0, + "step": 3272 + }, + { + "epoch": 0.8850730124391563, + "grad_norm": 2.59375, + "learning_rate": 0.018999825483936375, + "loss": 3.2499, + "mean_token_accuracy": 0.3810777962207794, + "num_tokens": 1673615023.0, + "step": 3273 + }, + { + "epoch": 0.8853434288804759, + "grad_norm": 2.8125, + "learning_rate": 0.01899910307617805, + "loss": 3.4501, + "mean_token_accuracy": 0.36764734983444214, + "num_tokens": 1674139289.0, + "step": 3274 + }, + { + "epoch": 0.8856138453217955, + "grad_norm": 3.609375, + "learning_rate": 0.01899838042298372, + "loss": 3.4399, + "mean_token_accuracy": 0.3787887692451477, + "num_tokens": 1674603358.0, + "step": 3275 + }, + { + "epoch": 0.8858842617631152, + "grad_norm": 2.953125, + "learning_rate": 0.018997657524375555, + "loss": 3.4675, + "mean_token_accuracy": 0.3679616451263428, + "num_tokens": 1675127536.0, + "step": 3276 + }, + { + "epoch": 0.8861546782044348, + "grad_norm": 3.21875, + "learning_rate": 0.018996934380375738, + "loss": 3.5262, + "mean_token_accuracy": 0.36807847023010254, + "num_tokens": 1675651805.0, + "step": 3277 + }, + { + "epoch": 0.8864250946457545, + "grad_norm": 4.03125, + "learning_rate": 0.018996210991006453, + "loss": 3.5405, + "mean_token_accuracy": 0.39055120944976807, + "num_tokens": 1676120189.0, + "step": 3278 + }, + { + "epoch": 0.8866955110870741, + "grad_norm": 3.890625, + "learning_rate": 0.018995487356289906, + "loss": 3.0953, + "mean_token_accuracy": 0.4426279067993164, + "num_tokens": 1676583526.0, + "step": 3279 + }, + { + "epoch": 0.8869659275283938, + "grad_norm": 3.40625, + "learning_rate": 0.018994763476248292, + "loss": 3.483, + "mean_token_accuracy": 0.3721836805343628, + "num_tokens": 1677107799.0, + "step": 3280 + }, + { + "epoch": 0.8872363439697134, + "grad_norm": 4.125, + "learning_rate": 0.018994039350903824, + "loss": 3.7067, + "mean_token_accuracy": 0.3496251702308655, + "num_tokens": 1677632028.0, + "step": 3281 + }, + { + "epoch": 0.8875067604110329, + "grad_norm": 3.421875, + "learning_rate": 0.018993314980278717, + "loss": 3.4476, + "mean_token_accuracy": 0.3806725740432739, + "num_tokens": 1678156123.0, + "step": 3282 + }, + { + "epoch": 0.8877771768523526, + "grad_norm": 3.25, + "learning_rate": 0.0189925903643952, + "loss": 3.3815, + "mean_token_accuracy": 0.3793505132198334, + "num_tokens": 1678680269.0, + "step": 3283 + }, + { + "epoch": 0.8880475932936722, + "grad_norm": 2.859375, + "learning_rate": 0.018991865503275508, + "loss": 3.2798, + "mean_token_accuracy": 0.37663549184799194, + "num_tokens": 1679204498.0, + "step": 3284 + }, + { + "epoch": 0.8883180097349919, + "grad_norm": 2.5625, + "learning_rate": 0.01899114039694188, + "loss": 3.2945, + "mean_token_accuracy": 0.38527509570121765, + "num_tokens": 1679707306.0, + "step": 3285 + }, + { + "epoch": 0.8885884261763115, + "grad_norm": 3.84375, + "learning_rate": 0.018990415045416563, + "loss": 3.3938, + "mean_token_accuracy": 0.39480650424957275, + "num_tokens": 1680187603.0, + "step": 3286 + }, + { + "epoch": 0.8888588426176312, + "grad_norm": 3.671875, + "learning_rate": 0.018989689448721814, + "loss": 3.2151, + "mean_token_accuracy": 0.36670881509780884, + "num_tokens": 1680711757.0, + "step": 3287 + }, + { + "epoch": 0.8891292590589508, + "grad_norm": 2.9375, + "learning_rate": 0.018988963606879895, + "loss": 3.2277, + "mean_token_accuracy": 0.37714362144470215, + "num_tokens": 1681236026.0, + "step": 3288 + }, + { + "epoch": 0.8893996755002704, + "grad_norm": 3.203125, + "learning_rate": 0.01898823751991308, + "loss": 3.5387, + "mean_token_accuracy": 0.3471837043762207, + "num_tokens": 1681760234.0, + "step": 3289 + }, + { + "epoch": 0.8896700919415901, + "grad_norm": 3.5625, + "learning_rate": 0.018987511187843645, + "loss": 3.5715, + "mean_token_accuracy": 0.3502863645553589, + "num_tokens": 1682284471.0, + "step": 3290 + }, + { + "epoch": 0.8899405083829097, + "grad_norm": 110.5, + "learning_rate": 0.01898678461069388, + "loss": 12.7163, + "mean_token_accuracy": 0.0003161305212415755, + "num_tokens": 1682808562.0, + "step": 3291 + }, + { + "epoch": 0.8902109248242294, + "grad_norm": 5.5, + "learning_rate": 0.018986057788486072, + "loss": 3.8845, + "mean_token_accuracy": 0.3157777190208435, + "num_tokens": 1683332628.0, + "step": 3292 + }, + { + "epoch": 0.8904813412655489, + "grad_norm": 2.046875, + "learning_rate": 0.018985330721242526, + "loss": 3.29, + "mean_token_accuracy": 0.3713594079017639, + "num_tokens": 1683856838.0, + "step": 3293 + }, + { + "epoch": 0.8907517577068685, + "grad_norm": 2.53125, + "learning_rate": 0.018984603408985553, + "loss": 3.4303, + "mean_token_accuracy": 0.35395246744155884, + "num_tokens": 1684364056.0, + "step": 3294 + }, + { + "epoch": 0.8910221741481882, + "grad_norm": 2.84375, + "learning_rate": 0.018983875851737466, + "loss": 3.4006, + "mean_token_accuracy": 0.3690222203731537, + "num_tokens": 1684888321.0, + "step": 3295 + }, + { + "epoch": 0.8912925905895078, + "grad_norm": 3.671875, + "learning_rate": 0.018983148049520587, + "loss": 3.2377, + "mean_token_accuracy": 0.36236587166786194, + "num_tokens": 1685412517.0, + "step": 3296 + }, + { + "epoch": 0.8915630070308275, + "grad_norm": 2.984375, + "learning_rate": 0.01898242000235725, + "loss": 3.312, + "mean_token_accuracy": 0.37755680084228516, + "num_tokens": 1685936765.0, + "step": 3297 + }, + { + "epoch": 0.8918334234721471, + "grad_norm": 2.796875, + "learning_rate": 0.01898169171026979, + "loss": 3.5409, + "mean_token_accuracy": 0.37833505868911743, + "num_tokens": 1686433886.0, + "step": 3298 + }, + { + "epoch": 0.8921038399134668, + "grad_norm": 2.53125, + "learning_rate": 0.018980963173280557, + "loss": 3.439, + "mean_token_accuracy": 0.3770987391471863, + "num_tokens": 1686958004.0, + "step": 3299 + }, + { + "epoch": 0.8923742563547864, + "grad_norm": 2.21875, + "learning_rate": 0.018980234391411905, + "loss": 3.3075, + "mean_token_accuracy": 0.38777726888656616, + "num_tokens": 1687465880.0, + "step": 3300 + }, + { + "epoch": 0.892644672796106, + "grad_norm": 1.9921875, + "learning_rate": 0.018979505364686195, + "loss": 3.4359, + "mean_token_accuracy": 0.355488121509552, + "num_tokens": 1687990074.0, + "step": 3301 + }, + { + "epoch": 0.8929150892374257, + "grad_norm": 2.65625, + "learning_rate": 0.018978776093125788, + "loss": 3.3659, + "mean_token_accuracy": 0.38712140917778015, + "num_tokens": 1688469941.0, + "step": 3302 + }, + { + "epoch": 0.8931855056787452, + "grad_norm": 2.609375, + "learning_rate": 0.01897804657675307, + "loss": 3.1419, + "mean_token_accuracy": 0.3846103847026825, + "num_tokens": 1688948583.0, + "step": 3303 + }, + { + "epoch": 0.8934559221200649, + "grad_norm": 3.1875, + "learning_rate": 0.018977316815590423, + "loss": 3.3271, + "mean_token_accuracy": 0.3837285041809082, + "num_tokens": 1689432395.0, + "step": 3304 + }, + { + "epoch": 0.8937263385613845, + "grad_norm": 2.78125, + "learning_rate": 0.018976586809660235, + "loss": 3.2669, + "mean_token_accuracy": 0.3752034306526184, + "num_tokens": 1689931914.0, + "step": 3305 + }, + { + "epoch": 0.8939967550027041, + "grad_norm": 2.109375, + "learning_rate": 0.018975856558984906, + "loss": 3.0485, + "mean_token_accuracy": 0.416617751121521, + "num_tokens": 1690415866.0, + "step": 3306 + }, + { + "epoch": 0.8942671714440238, + "grad_norm": 3.25, + "learning_rate": 0.01897512606358684, + "loss": 3.3291, + "mean_token_accuracy": 0.3837175965309143, + "num_tokens": 1690892915.0, + "step": 3307 + }, + { + "epoch": 0.8945375878853434, + "grad_norm": 3.65625, + "learning_rate": 0.018974395323488453, + "loss": 3.4495, + "mean_token_accuracy": 0.3923073410987854, + "num_tokens": 1691395385.0, + "step": 3308 + }, + { + "epoch": 0.8948080043266631, + "grad_norm": 3.125, + "learning_rate": 0.01897366433871217, + "loss": 3.3574, + "mean_token_accuracy": 0.37651708722114563, + "num_tokens": 1691862367.0, + "step": 3309 + }, + { + "epoch": 0.8950784207679827, + "grad_norm": 2.796875, + "learning_rate": 0.018972933109280415, + "loss": 3.2551, + "mean_token_accuracy": 0.38553881645202637, + "num_tokens": 1692386486.0, + "step": 3310 + }, + { + "epoch": 0.8953488372093024, + "grad_norm": 45.25, + "learning_rate": 0.018972201635215624, + "loss": 11.7818, + "mean_token_accuracy": 0.03569548577070236, + "num_tokens": 1692863921.0, + "step": 3311 + }, + { + "epoch": 0.895619253650622, + "grad_norm": 6.90625, + "learning_rate": 0.018971469916540244, + "loss": 4.1049, + "mean_token_accuracy": 0.2945352792739868, + "num_tokens": 1693353717.0, + "step": 3312 + }, + { + "epoch": 0.8958896700919415, + "grad_norm": 2.90625, + "learning_rate": 0.018970737953276723, + "loss": 3.5364, + "mean_token_accuracy": 0.33584314584732056, + "num_tokens": 1693877869.0, + "step": 3313 + }, + { + "epoch": 0.8961600865332612, + "grad_norm": 3.421875, + "learning_rate": 0.01897000574544752, + "loss": 3.5472, + "mean_token_accuracy": 0.3696840703487396, + "num_tokens": 1694370856.0, + "step": 3314 + }, + { + "epoch": 0.8964305029745808, + "grad_norm": 3.15625, + "learning_rate": 0.0189692732930751, + "loss": 3.2883, + "mean_token_accuracy": 0.3707146942615509, + "num_tokens": 1694895041.0, + "step": 3315 + }, + { + "epoch": 0.8967009194159005, + "grad_norm": 3.90625, + "learning_rate": 0.018968540596181947, + "loss": 3.5318, + "mean_token_accuracy": 0.33389514684677124, + "num_tokens": 1695419271.0, + "step": 3316 + }, + { + "epoch": 0.8969713358572201, + "grad_norm": 3.921875, + "learning_rate": 0.01896780765479053, + "loss": 3.5123, + "mean_token_accuracy": 0.37090471386909485, + "num_tokens": 1695943418.0, + "step": 3317 + }, + { + "epoch": 0.8972417522985398, + "grad_norm": 4.21875, + "learning_rate": 0.018967074468923345, + "loss": 3.298, + "mean_token_accuracy": 0.3577435314655304, + "num_tokens": 1696467627.0, + "step": 3318 + }, + { + "epoch": 0.8975121687398594, + "grad_norm": 3.09375, + "learning_rate": 0.018966341038602882, + "loss": 3.5172, + "mean_token_accuracy": 0.37954843044281006, + "num_tokens": 1696950949.0, + "step": 3319 + }, + { + "epoch": 0.897782585181179, + "grad_norm": 3.4375, + "learning_rate": 0.018965607363851653, + "loss": 3.5377, + "mean_token_accuracy": 0.37493789196014404, + "num_tokens": 1697436670.0, + "step": 3320 + }, + { + "epoch": 0.8980530016224987, + "grad_norm": 3.078125, + "learning_rate": 0.01896487344469216, + "loss": 3.4244, + "mean_token_accuracy": 0.3764674961566925, + "num_tokens": 1697960876.0, + "step": 3321 + }, + { + "epoch": 0.8983234180638183, + "grad_norm": 4.34375, + "learning_rate": 0.018964139281146927, + "loss": 3.6446, + "mean_token_accuracy": 0.35342615842819214, + "num_tokens": 1698480394.0, + "step": 3322 + }, + { + "epoch": 0.898593834505138, + "grad_norm": 3.0, + "learning_rate": 0.018963404873238485, + "loss": 3.3941, + "mean_token_accuracy": 0.33580756187438965, + "num_tokens": 1699004479.0, + "step": 3323 + }, + { + "epoch": 0.8988642509464575, + "grad_norm": 2.765625, + "learning_rate": 0.018962670220989356, + "loss": 3.2141, + "mean_token_accuracy": 0.3994113802909851, + "num_tokens": 1699468290.0, + "step": 3324 + }, + { + "epoch": 0.8991346673877771, + "grad_norm": 2.734375, + "learning_rate": 0.01896193532442209, + "loss": 3.2659, + "mean_token_accuracy": 0.3762575387954712, + "num_tokens": 1699966113.0, + "step": 3325 + }, + { + "epoch": 0.8994050838290968, + "grad_norm": 2.8125, + "learning_rate": 0.018961200183559238, + "loss": 3.0312, + "mean_token_accuracy": 0.3738456666469574, + "num_tokens": 1700490329.0, + "step": 3326 + }, + { + "epoch": 0.8996755002704164, + "grad_norm": 2.84375, + "learning_rate": 0.018960464798423348, + "loss": 3.3522, + "mean_token_accuracy": 0.375493586063385, + "num_tokens": 1700993044.0, + "step": 3327 + }, + { + "epoch": 0.8999459167117361, + "grad_norm": 7.90625, + "learning_rate": 0.018959729169036984, + "loss": 3.295, + "mean_token_accuracy": 0.3788338005542755, + "num_tokens": 1701517277.0, + "step": 3328 + }, + { + "epoch": 0.9002163331530557, + "grad_norm": 1.9296875, + "learning_rate": 0.018958993295422732, + "loss": 3.3345, + "mean_token_accuracy": 0.372702956199646, + "num_tokens": 1702041474.0, + "step": 3329 + }, + { + "epoch": 0.9004867495943754, + "grad_norm": 3.078125, + "learning_rate": 0.01895825717760315, + "loss": 3.2574, + "mean_token_accuracy": 0.38987332582473755, + "num_tokens": 1702524933.0, + "step": 3330 + }, + { + "epoch": 0.900757166035695, + "grad_norm": 16.375, + "learning_rate": 0.01895752081560084, + "loss": 14.806, + "mean_token_accuracy": 0.00036528409691527486, + "num_tokens": 1703049172.0, + "step": 3331 + }, + { + "epoch": 0.9010275824770146, + "grad_norm": 5.21875, + "learning_rate": 0.018956784209438384, + "loss": 3.5046, + "mean_token_accuracy": 0.33880266547203064, + "num_tokens": 1703541320.0, + "step": 3332 + }, + { + "epoch": 0.9012979989183343, + "grad_norm": 2.484375, + "learning_rate": 0.018956047359138393, + "loss": 3.3191, + "mean_token_accuracy": 0.36508381366729736, + "num_tokens": 1704065576.0, + "step": 3333 + }, + { + "epoch": 0.9015684153596538, + "grad_norm": 3.0625, + "learning_rate": 0.018955310264723472, + "loss": 3.481, + "mean_token_accuracy": 0.3718896210193634, + "num_tokens": 1704589718.0, + "step": 3334 + }, + { + "epoch": 0.9018388318009735, + "grad_norm": 3.1875, + "learning_rate": 0.018954572926216235, + "loss": 3.1836, + "mean_token_accuracy": 0.3810073733329773, + "num_tokens": 1705092184.0, + "step": 3335 + }, + { + "epoch": 0.9021092482422931, + "grad_norm": 2.625, + "learning_rate": 0.018953835343639307, + "loss": 3.2823, + "mean_token_accuracy": 0.37682437896728516, + "num_tokens": 1705616326.0, + "step": 3336 + }, + { + "epoch": 0.9023796646836127, + "grad_norm": 3.46875, + "learning_rate": 0.01895309751701532, + "loss": 3.278, + "mean_token_accuracy": 0.38312214612960815, + "num_tokens": 1706094729.0, + "step": 3337 + }, + { + "epoch": 0.9026500811249324, + "grad_norm": 3.125, + "learning_rate": 0.018952359446366914, + "loss": 3.6291, + "mean_token_accuracy": 0.39518362283706665, + "num_tokens": 1706555535.0, + "step": 3338 + }, + { + "epoch": 0.902920497566252, + "grad_norm": 2.828125, + "learning_rate": 0.01895162113171674, + "loss": 3.2693, + "mean_token_accuracy": 0.36660122871398926, + "num_tokens": 1707079693.0, + "step": 3339 + }, + { + "epoch": 0.9031909140075717, + "grad_norm": 2.671875, + "learning_rate": 0.018950882573087437, + "loss": 3.5346, + "mean_token_accuracy": 0.37240952253341675, + "num_tokens": 1707603798.0, + "step": 3340 + }, + { + "epoch": 0.9034613304488913, + "grad_norm": 3.125, + "learning_rate": 0.01895014377050168, + "loss": 3.5281, + "mean_token_accuracy": 0.38209134340286255, + "num_tokens": 1708006629.0, + "step": 3341 + }, + { + "epoch": 0.903731746890211, + "grad_norm": 2.890625, + "learning_rate": 0.018949404723982127, + "loss": 3.3812, + "mean_token_accuracy": 0.38129884004592896, + "num_tokens": 1708509688.0, + "step": 3342 + }, + { + "epoch": 0.9040021633315306, + "grad_norm": 3.015625, + "learning_rate": 0.018948665433551462, + "loss": 3.2843, + "mean_token_accuracy": 0.3704160153865814, + "num_tokens": 1709033885.0, + "step": 3343 + }, + { + "epoch": 0.9042725797728501, + "grad_norm": 2.59375, + "learning_rate": 0.018947925899232367, + "loss": 3.2767, + "mean_token_accuracy": 0.3721647262573242, + "num_tokens": 1709557939.0, + "step": 3344 + }, + { + "epoch": 0.9045429962141698, + "grad_norm": 4.5625, + "learning_rate": 0.01894718612104753, + "loss": 3.3872, + "mean_token_accuracy": 0.374585896730423, + "num_tokens": 1710033656.0, + "step": 3345 + }, + { + "epoch": 0.9048134126554894, + "grad_norm": 2.453125, + "learning_rate": 0.018946446099019656, + "loss": 3.3825, + "mean_token_accuracy": 0.39114728569984436, + "num_tokens": 1710518283.0, + "step": 3346 + }, + { + "epoch": 0.9050838290968091, + "grad_norm": 3.015625, + "learning_rate": 0.018945705833171442, + "loss": 3.2446, + "mean_token_accuracy": 0.3825148940086365, + "num_tokens": 1711042519.0, + "step": 3347 + }, + { + "epoch": 0.9053542455381287, + "grad_norm": 2.484375, + "learning_rate": 0.018944965323525607, + "loss": 3.4239, + "mean_token_accuracy": 0.3602648377418518, + "num_tokens": 1711510610.0, + "step": 3348 + }, + { + "epoch": 0.9056246619794484, + "grad_norm": 3.0, + "learning_rate": 0.018944224570104872, + "loss": 3.1664, + "mean_token_accuracy": 0.38663721084594727, + "num_tokens": 1712034859.0, + "step": 3349 + }, + { + "epoch": 0.905895078420768, + "grad_norm": 2.34375, + "learning_rate": 0.018943483572931963, + "loss": 3.2666, + "mean_token_accuracy": 0.39248913526535034, + "num_tokens": 1712559138.0, + "step": 3350 + }, + { + "epoch": 0.9061654948620876, + "grad_norm": 242.0, + "learning_rate": 0.018942742332029618, + "loss": 15.7103, + "mean_token_accuracy": 0.038963306695222855, + "num_tokens": 1713083331.0, + "step": 3351 + }, + { + "epoch": 0.9064359113034073, + "grad_norm": 7.59375, + "learning_rate": 0.018942000847420586, + "loss": 3.8556, + "mean_token_accuracy": 0.27234601974487305, + "num_tokens": 1713607390.0, + "step": 3352 + }, + { + "epoch": 0.9067063277447269, + "grad_norm": 3.203125, + "learning_rate": 0.018941259119127606, + "loss": 3.7589, + "mean_token_accuracy": 0.3210049867630005, + "num_tokens": 1714131542.0, + "step": 3353 + }, + { + "epoch": 0.9069767441860465, + "grad_norm": 3.484375, + "learning_rate": 0.018940517147173444, + "loss": 3.6311, + "mean_token_accuracy": 0.3589804768562317, + "num_tokens": 1714655636.0, + "step": 3354 + }, + { + "epoch": 0.9072471606273661, + "grad_norm": 2.96875, + "learning_rate": 0.018939774931580864, + "loss": 3.5384, + "mean_token_accuracy": 0.3626742362976074, + "num_tokens": 1715179916.0, + "step": 3355 + }, + { + "epoch": 0.9075175770686857, + "grad_norm": 3.84375, + "learning_rate": 0.018939032472372642, + "loss": 3.6165, + "mean_token_accuracy": 0.3528837561607361, + "num_tokens": 1715704011.0, + "step": 3356 + }, + { + "epoch": 0.9077879935100054, + "grad_norm": 2.4375, + "learning_rate": 0.018938289769571556, + "loss": 3.4349, + "mean_token_accuracy": 0.36683234572410583, + "num_tokens": 1716228276.0, + "step": 3357 + }, + { + "epoch": 0.908058409951325, + "grad_norm": 4.0625, + "learning_rate": 0.018937546823200393, + "loss": 3.5772, + "mean_token_accuracy": 0.3451668620109558, + "num_tokens": 1716752387.0, + "step": 3358 + }, + { + "epoch": 0.9083288263926447, + "grad_norm": 2.6875, + "learning_rate": 0.01893680363328195, + "loss": 3.3135, + "mean_token_accuracy": 0.3653799295425415, + "num_tokens": 1717276454.0, + "step": 3359 + }, + { + "epoch": 0.9085992428339643, + "grad_norm": 2.96875, + "learning_rate": 0.018936060199839032, + "loss": 3.3977, + "mean_token_accuracy": 0.36390793323516846, + "num_tokens": 1717800730.0, + "step": 3360 + }, + { + "epoch": 0.908869659275284, + "grad_norm": 2.640625, + "learning_rate": 0.018935316522894447, + "loss": 3.136, + "mean_token_accuracy": 0.3594456911087036, + "num_tokens": 1718324830.0, + "step": 3361 + }, + { + "epoch": 0.9091400757166036, + "grad_norm": 2.34375, + "learning_rate": 0.018934572602471018, + "loss": 3.4788, + "mean_token_accuracy": 0.37592747807502747, + "num_tokens": 1718849071.0, + "step": 3362 + }, + { + "epoch": 0.9094104921579232, + "grad_norm": 3.71875, + "learning_rate": 0.018933828438591566, + "loss": 3.467, + "mean_token_accuracy": 0.3714175224304199, + "num_tokens": 1719373356.0, + "step": 3363 + }, + { + "epoch": 0.9096809085992429, + "grad_norm": 2.6875, + "learning_rate": 0.01893308403127893, + "loss": 3.261, + "mean_token_accuracy": 0.36581912636756897, + "num_tokens": 1719897615.0, + "step": 3364 + }, + { + "epoch": 0.9099513250405624, + "grad_norm": 2.921875, + "learning_rate": 0.01893233938055594, + "loss": 3.365, + "mean_token_accuracy": 0.39333194494247437, + "num_tokens": 1720393324.0, + "step": 3365 + }, + { + "epoch": 0.9102217414818821, + "grad_norm": 3.671875, + "learning_rate": 0.018931594486445453, + "loss": 3.5437, + "mean_token_accuracy": 0.3565545976161957, + "num_tokens": 1720917522.0, + "step": 3366 + }, + { + "epoch": 0.9104921579232017, + "grad_norm": 2.296875, + "learning_rate": 0.01893084934897032, + "loss": 3.302, + "mean_token_accuracy": 0.375916063785553, + "num_tokens": 1721406395.0, + "step": 3367 + }, + { + "epoch": 0.9107625743645213, + "grad_norm": 2.890625, + "learning_rate": 0.01893010396815341, + "loss": 3.4598, + "mean_token_accuracy": 0.36178165674209595, + "num_tokens": 1721930572.0, + "step": 3368 + }, + { + "epoch": 0.911032990805841, + "grad_norm": 3.109375, + "learning_rate": 0.018929358344017586, + "loss": 3.3937, + "mean_token_accuracy": 0.3860280513763428, + "num_tokens": 1722454851.0, + "step": 3369 + }, + { + "epoch": 0.9113034072471606, + "grad_norm": 5.9375, + "learning_rate": 0.01892861247658573, + "loss": 2.9945, + "mean_token_accuracy": 0.39033329486846924, + "num_tokens": 1722979073.0, + "step": 3370 + }, + { + "epoch": 0.9115738236884803, + "grad_norm": 4.0, + "learning_rate": 0.018927866365880727, + "loss": 11.3095, + "mean_token_accuracy": 6.309714535746025e-06, + "num_tokens": 1723473409.0, + "step": 3371 + }, + { + "epoch": 0.9118442401297999, + "grad_norm": 8.4375, + "learning_rate": 0.018927120011925466, + "loss": 3.9164, + "mean_token_accuracy": 0.3001629710197449, + "num_tokens": 1723997685.0, + "step": 3372 + }, + { + "epoch": 0.9121146565711196, + "grad_norm": 3.65625, + "learning_rate": 0.018926373414742854, + "loss": 3.7419, + "mean_token_accuracy": 0.3293946087360382, + "num_tokens": 1724521924.0, + "step": 3373 + }, + { + "epoch": 0.9123850730124392, + "grad_norm": 3.578125, + "learning_rate": 0.018925626574355797, + "loss": 3.5745, + "mean_token_accuracy": 0.35672277212142944, + "num_tokens": 1725046107.0, + "step": 3374 + }, + { + "epoch": 0.9126554894537587, + "grad_norm": 2.4375, + "learning_rate": 0.018924879490787208, + "loss": 3.2912, + "mean_token_accuracy": 0.37295612692832947, + "num_tokens": 1725570238.0, + "step": 3375 + }, + { + "epoch": 0.9129259058950784, + "grad_norm": 3.328125, + "learning_rate": 0.018924132164060012, + "loss": 3.5724, + "mean_token_accuracy": 0.32883113622665405, + "num_tokens": 1726094472.0, + "step": 3376 + }, + { + "epoch": 0.913196322336398, + "grad_norm": 3.265625, + "learning_rate": 0.018923384594197134, + "loss": 3.3855, + "mean_token_accuracy": 0.35684460401535034, + "num_tokens": 1726602224.0, + "step": 3377 + }, + { + "epoch": 0.9134667387777177, + "grad_norm": 2.65625, + "learning_rate": 0.018922636781221518, + "loss": 3.4061, + "mean_token_accuracy": 0.3449009358882904, + "num_tokens": 1727126499.0, + "step": 3378 + }, + { + "epoch": 0.9137371552190373, + "grad_norm": 2.546875, + "learning_rate": 0.018921888725156103, + "loss": 3.2498, + "mean_token_accuracy": 0.3836894631385803, + "num_tokens": 1727650622.0, + "step": 3379 + }, + { + "epoch": 0.914007571660357, + "grad_norm": 3.0, + "learning_rate": 0.01892114042602385, + "loss": 3.3644, + "mean_token_accuracy": 0.3681790828704834, + "num_tokens": 1728169611.0, + "step": 3380 + }, + { + "epoch": 0.9142779881016766, + "grad_norm": 3.640625, + "learning_rate": 0.018920391883847712, + "loss": 3.2327, + "mean_token_accuracy": 0.44792047142982483, + "num_tokens": 1728635601.0, + "step": 3381 + }, + { + "epoch": 0.9145484045429962, + "grad_norm": 2.390625, + "learning_rate": 0.01891964309865066, + "loss": 3.3881, + "mean_token_accuracy": 0.3759072422981262, + "num_tokens": 1729159747.0, + "step": 3382 + }, + { + "epoch": 0.9148188209843159, + "grad_norm": 3.890625, + "learning_rate": 0.018918894070455664, + "loss": 3.5577, + "mean_token_accuracy": 0.36477023363113403, + "num_tokens": 1729683938.0, + "step": 3383 + }, + { + "epoch": 0.9150892374256355, + "grad_norm": 3.765625, + "learning_rate": 0.018918144799285715, + "loss": 3.5143, + "mean_token_accuracy": 0.3785942792892456, + "num_tokens": 1730171904.0, + "step": 3384 + }, + { + "epoch": 0.9153596538669551, + "grad_norm": 3.03125, + "learning_rate": 0.018917395285163795, + "loss": 3.4861, + "mean_token_accuracy": 0.3473418354988098, + "num_tokens": 1730696165.0, + "step": 3385 + }, + { + "epoch": 0.9156300703082747, + "grad_norm": 2.6875, + "learning_rate": 0.018916645528112907, + "loss": 3.4083, + "mean_token_accuracy": 0.378976047039032, + "num_tokens": 1731220404.0, + "step": 3386 + }, + { + "epoch": 0.9159004867495943, + "grad_norm": 3.71875, + "learning_rate": 0.01891589552815605, + "loss": 3.2837, + "mean_token_accuracy": 0.3536626994609833, + "num_tokens": 1731744481.0, + "step": 3387 + }, + { + "epoch": 0.916170903190914, + "grad_norm": 3.21875, + "learning_rate": 0.01891514528531624, + "loss": 3.1939, + "mean_token_accuracy": 0.40294331312179565, + "num_tokens": 1732240432.0, + "step": 3388 + }, + { + "epoch": 0.9164413196322336, + "grad_norm": 3.109375, + "learning_rate": 0.018914394799616496, + "loss": 3.3159, + "mean_token_accuracy": 0.38596633076667786, + "num_tokens": 1732714041.0, + "step": 3389 + }, + { + "epoch": 0.9167117360735533, + "grad_norm": 3.484375, + "learning_rate": 0.018913644071079844, + "loss": 3.2352, + "mean_token_accuracy": 0.3722374439239502, + "num_tokens": 1733238285.0, + "step": 3390 + }, + { + "epoch": 0.9169821525148729, + "grad_norm": 60.0, + "learning_rate": 0.01891289309972932, + "loss": 9.3946, + "mean_token_accuracy": 0.015522009693086147, + "num_tokens": 1733762480.0, + "step": 3391 + }, + { + "epoch": 0.9172525689561926, + "grad_norm": 4.84375, + "learning_rate": 0.01891214188558796, + "loss": 3.8083, + "mean_token_accuracy": 0.34902113676071167, + "num_tokens": 1734186945.0, + "step": 3392 + }, + { + "epoch": 0.9175229853975122, + "grad_norm": 2.578125, + "learning_rate": 0.01891139042867883, + "loss": 3.6315, + "mean_token_accuracy": 0.3525266647338867, + "num_tokens": 1734711214.0, + "step": 3393 + }, + { + "epoch": 0.9177934018388318, + "grad_norm": 3.765625, + "learning_rate": 0.018910638729024966, + "loss": 3.5198, + "mean_token_accuracy": 0.34339475631713867, + "num_tokens": 1735235367.0, + "step": 3394 + }, + { + "epoch": 0.9180638182801514, + "grad_norm": 2.890625, + "learning_rate": 0.018909886786649448, + "loss": 3.4153, + "mean_token_accuracy": 0.38203129172325134, + "num_tokens": 1735759461.0, + "step": 3395 + }, + { + "epoch": 0.918334234721471, + "grad_norm": 3.125, + "learning_rate": 0.018909134601575336, + "loss": 3.4623, + "mean_token_accuracy": 0.3483876883983612, + "num_tokens": 1736283717.0, + "step": 3396 + }, + { + "epoch": 0.9186046511627907, + "grad_norm": 3.09375, + "learning_rate": 0.01890838217382572, + "loss": 3.5985, + "mean_token_accuracy": 0.3719058036804199, + "num_tokens": 1736746511.0, + "step": 3397 + }, + { + "epoch": 0.9188750676041103, + "grad_norm": 2.21875, + "learning_rate": 0.018907629503423683, + "loss": 3.2487, + "mean_token_accuracy": 0.37307971715927124, + "num_tokens": 1737270754.0, + "step": 3398 + }, + { + "epoch": 0.91914548404543, + "grad_norm": 2.84375, + "learning_rate": 0.01890687659039231, + "loss": 3.3857, + "mean_token_accuracy": 0.3735074996948242, + "num_tokens": 1737794923.0, + "step": 3399 + }, + { + "epoch": 0.9194159004867496, + "grad_norm": 3.078125, + "learning_rate": 0.018906123434754722, + "loss": 3.3718, + "mean_token_accuracy": 0.3846474885940552, + "num_tokens": 1738319186.0, + "step": 3400 + }, + { + "epoch": 0.9196863169280692, + "grad_norm": 2.734375, + "learning_rate": 0.01890537003653401, + "loss": 3.359, + "mean_token_accuracy": 0.376949667930603, + "num_tokens": 1738818523.0, + "step": 3401 + }, + { + "epoch": 0.9199567333693889, + "grad_norm": 3.390625, + "learning_rate": 0.018904616395753297, + "loss": 3.515, + "mean_token_accuracy": 0.3433181643486023, + "num_tokens": 1739342605.0, + "step": 3402 + }, + { + "epoch": 0.9202271498107085, + "grad_norm": 2.078125, + "learning_rate": 0.01890386251243571, + "loss": 3.4549, + "mean_token_accuracy": 0.4044555425643921, + "num_tokens": 1739802349.0, + "step": 3403 + }, + { + "epoch": 0.9204975662520282, + "grad_norm": 2.703125, + "learning_rate": 0.018903108386604372, + "loss": 3.3621, + "mean_token_accuracy": 0.36271077394485474, + "num_tokens": 1740326243.0, + "step": 3404 + }, + { + "epoch": 0.9207679826933478, + "grad_norm": 2.671875, + "learning_rate": 0.01890235401828243, + "loss": 3.0552, + "mean_token_accuracy": 0.4018078148365021, + "num_tokens": 1740850383.0, + "step": 3405 + }, + { + "epoch": 0.9210383991346673, + "grad_norm": 5.78125, + "learning_rate": 0.01890159940749303, + "loss": 3.2146, + "mean_token_accuracy": 0.3815043568611145, + "num_tokens": 1741374648.0, + "step": 3406 + }, + { + "epoch": 0.921308815575987, + "grad_norm": 1.734375, + "learning_rate": 0.01890084455425932, + "loss": 3.2422, + "mean_token_accuracy": 0.3883063793182373, + "num_tokens": 1741898821.0, + "step": 3407 + }, + { + "epoch": 0.9215792320173066, + "grad_norm": 4.15625, + "learning_rate": 0.018900089458604465, + "loss": 3.4604, + "mean_token_accuracy": 0.36319500207901, + "num_tokens": 1742413109.0, + "step": 3408 + }, + { + "epoch": 0.9218496484586263, + "grad_norm": 2.46875, + "learning_rate": 0.018899334120551635, + "loss": 3.2821, + "mean_token_accuracy": 0.3885047435760498, + "num_tokens": 1742935164.0, + "step": 3409 + }, + { + "epoch": 0.9221200648999459, + "grad_norm": 3.671875, + "learning_rate": 0.018898578540124, + "loss": 3.5105, + "mean_token_accuracy": 0.3730197548866272, + "num_tokens": 1743459422.0, + "step": 3410 + }, + { + "epoch": 0.9223904813412656, + "grad_norm": 0.66796875, + "learning_rate": 0.01889782271734475, + "loss": 11.053, + "mean_token_accuracy": 2.077300268865656e-05, + "num_tokens": 1743983591.0, + "step": 3411 + }, + { + "epoch": 0.9226608977825852, + "grad_norm": 8.5625, + "learning_rate": 0.01889706665223707, + "loss": 4.0834, + "mean_token_accuracy": 0.3054622411727905, + "num_tokens": 1744507812.0, + "step": 3412 + }, + { + "epoch": 0.9229313142239048, + "grad_norm": 2.625, + "learning_rate": 0.018896310344824165, + "loss": 3.4815, + "mean_token_accuracy": 0.3623987138271332, + "num_tokens": 1745031995.0, + "step": 3413 + }, + { + "epoch": 0.9232017306652245, + "grad_norm": 4.125, + "learning_rate": 0.018895553795129236, + "loss": 3.5072, + "mean_token_accuracy": 0.335814893245697, + "num_tokens": 1745556028.0, + "step": 3414 + }, + { + "epoch": 0.9234721471065441, + "grad_norm": 3.125, + "learning_rate": 0.018894797003175496, + "loss": 3.4514, + "mean_token_accuracy": 0.3649982213973999, + "num_tokens": 1746080274.0, + "step": 3415 + }, + { + "epoch": 0.9237425635478637, + "grad_norm": 3.1875, + "learning_rate": 0.018894039968986167, + "loss": 3.5924, + "mean_token_accuracy": 0.33648252487182617, + "num_tokens": 1746604551.0, + "step": 3416 + }, + { + "epoch": 0.9240129799891833, + "grad_norm": 3.53125, + "learning_rate": 0.018893282692584477, + "loss": 3.4143, + "mean_token_accuracy": 0.37137141823768616, + "num_tokens": 1747046520.0, + "step": 3417 + }, + { + "epoch": 0.924283396430503, + "grad_norm": 3.328125, + "learning_rate": 0.018892525173993664, + "loss": 3.5618, + "mean_token_accuracy": 0.35799866914749146, + "num_tokens": 1747512962.0, + "step": 3418 + }, + { + "epoch": 0.9245538128718226, + "grad_norm": 2.640625, + "learning_rate": 0.018891767413236966, + "loss": 3.3359, + "mean_token_accuracy": 0.3524065911769867, + "num_tokens": 1748037235.0, + "step": 3419 + }, + { + "epoch": 0.9248242293131422, + "grad_norm": 16.25, + "learning_rate": 0.018891009410337636, + "loss": 3.7005, + "mean_token_accuracy": 0.35588696599006653, + "num_tokens": 1748561519.0, + "step": 3420 + }, + { + "epoch": 0.9250946457544619, + "grad_norm": 3.765625, + "learning_rate": 0.01889025116531893, + "loss": 3.4533, + "mean_token_accuracy": 0.33053767681121826, + "num_tokens": 1749085715.0, + "step": 3421 + }, + { + "epoch": 0.9253650621957815, + "grad_norm": 2.3125, + "learning_rate": 0.018889492678204115, + "loss": 3.4487, + "mean_token_accuracy": 0.3967435657978058, + "num_tokens": 1749603838.0, + "step": 3422 + }, + { + "epoch": 0.9256354786371012, + "grad_norm": 3.46875, + "learning_rate": 0.018888733949016464, + "loss": 3.6219, + "mean_token_accuracy": 0.3480437994003296, + "num_tokens": 1750127989.0, + "step": 3423 + }, + { + "epoch": 0.9259058950784208, + "grad_norm": 2.84375, + "learning_rate": 0.018887974977779255, + "loss": 3.3966, + "mean_token_accuracy": 0.35328468680381775, + "num_tokens": 1750652100.0, + "step": 3424 + }, + { + "epoch": 0.9261763115197404, + "grad_norm": 3.0625, + "learning_rate": 0.018887215764515776, + "loss": 3.4701, + "mean_token_accuracy": 0.33770954608917236, + "num_tokens": 1751176247.0, + "step": 3425 + }, + { + "epoch": 0.92644672796106, + "grad_norm": 2.890625, + "learning_rate": 0.018886456309249327, + "loss": 3.5024, + "mean_token_accuracy": 0.3750672936439514, + "num_tokens": 1751639870.0, + "step": 3426 + }, + { + "epoch": 0.9267171444023796, + "grad_norm": 3.171875, + "learning_rate": 0.018885696612003202, + "loss": 3.4711, + "mean_token_accuracy": 0.35230931639671326, + "num_tokens": 1752164073.0, + "step": 3427 + }, + { + "epoch": 0.9269875608436993, + "grad_norm": 3.0, + "learning_rate": 0.018884936672800716, + "loss": 3.0029, + "mean_token_accuracy": 0.41625458002090454, + "num_tokens": 1752688245.0, + "step": 3428 + }, + { + "epoch": 0.9272579772850189, + "grad_norm": 3.21875, + "learning_rate": 0.018884176491665183, + "loss": 3.3995, + "mean_token_accuracy": 0.36647850275039673, + "num_tokens": 1753173039.0, + "step": 3429 + }, + { + "epoch": 0.9275283937263386, + "grad_norm": 3.359375, + "learning_rate": 0.01888341606861993, + "loss": 3.2336, + "mean_token_accuracy": 0.3732655644416809, + "num_tokens": 1753697219.0, + "step": 3430 + }, + { + "epoch": 0.9277988101676582, + "grad_norm": 105.5, + "learning_rate": 0.018882655403688293, + "loss": 14.7165, + "mean_token_accuracy": 0.0031255639623850584, + "num_tokens": 1754190216.0, + "step": 3431 + }, + { + "epoch": 0.9280692266089778, + "grad_norm": 7.5625, + "learning_rate": 0.018881894496893602, + "loss": 3.973, + "mean_token_accuracy": 0.29324084520339966, + "num_tokens": 1754714313.0, + "step": 3432 + }, + { + "epoch": 0.9283396430502975, + "grad_norm": 2.453125, + "learning_rate": 0.018881133348259208, + "loss": 3.4987, + "mean_token_accuracy": 0.34646379947662354, + "num_tokens": 1755238591.0, + "step": 3433 + }, + { + "epoch": 0.9286100594916171, + "grad_norm": 5.46875, + "learning_rate": 0.018880371957808468, + "loss": 3.4315, + "mean_token_accuracy": 0.3887506127357483, + "num_tokens": 1755762826.0, + "step": 3434 + }, + { + "epoch": 0.9288804759329368, + "grad_norm": 2.875, + "learning_rate": 0.018879610325564743, + "loss": 3.4257, + "mean_token_accuracy": 0.36146116256713867, + "num_tokens": 1756286966.0, + "step": 3435 + }, + { + "epoch": 0.9291508923742564, + "grad_norm": 3.90625, + "learning_rate": 0.018878848451551397, + "loss": 3.605, + "mean_token_accuracy": 0.33263731002807617, + "num_tokens": 1756811067.0, + "step": 3436 + }, + { + "epoch": 0.9294213088155759, + "grad_norm": 3.234375, + "learning_rate": 0.01887808633579181, + "loss": 3.4694, + "mean_token_accuracy": 0.354776531457901, + "num_tokens": 1757287015.0, + "step": 3437 + }, + { + "epoch": 0.9296917252568956, + "grad_norm": 3.484375, + "learning_rate": 0.018877323978309372, + "loss": 3.6306, + "mean_token_accuracy": 0.36465901136398315, + "num_tokens": 1757717339.0, + "step": 3438 + }, + { + "epoch": 0.9299621416982152, + "grad_norm": 2.421875, + "learning_rate": 0.018876561379127462, + "loss": 3.2523, + "mean_token_accuracy": 0.37699902057647705, + "num_tokens": 1758202328.0, + "step": 3439 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 3.28125, + "learning_rate": 0.018875798538269494, + "loss": 3.5468, + "mean_token_accuracy": 0.36840587854385376, + "num_tokens": 1758716213.0, + "step": 3440 + }, + { + "epoch": 0.9305029745808545, + "grad_norm": 3.046875, + "learning_rate": 0.018875035455758856, + "loss": 3.4582, + "mean_token_accuracy": 0.38331151008605957, + "num_tokens": 1759240451.0, + "step": 3441 + }, + { + "epoch": 0.9307733910221742, + "grad_norm": 4.375, + "learning_rate": 0.018874272131618977, + "loss": 3.637, + "mean_token_accuracy": 0.32961505651474, + "num_tokens": 1759710047.0, + "step": 3442 + }, + { + "epoch": 0.9310438074634938, + "grad_norm": 2.625, + "learning_rate": 0.018873508565873272, + "loss": 3.4638, + "mean_token_accuracy": 0.3771727383136749, + "num_tokens": 1760234288.0, + "step": 3443 + }, + { + "epoch": 0.9313142239048134, + "grad_norm": 3.671875, + "learning_rate": 0.018872744758545168, + "loss": 3.5238, + "mean_token_accuracy": 0.35156190395355225, + "num_tokens": 1760758494.0, + "step": 3444 + }, + { + "epoch": 0.9315846403461331, + "grad_norm": 3.0625, + "learning_rate": 0.018871980709658103, + "loss": 3.102, + "mean_token_accuracy": 0.408868670463562, + "num_tokens": 1761223123.0, + "step": 3445 + }, + { + "epoch": 0.9318550567874527, + "grad_norm": 3.71875, + "learning_rate": 0.01887121641923552, + "loss": 3.2998, + "mean_token_accuracy": 0.35028916597366333, + "num_tokens": 1761747304.0, + "step": 3446 + }, + { + "epoch": 0.9321254732287723, + "grad_norm": 2.40625, + "learning_rate": 0.01887045188730087, + "loss": 3.5473, + "mean_token_accuracy": 0.3618279695510864, + "num_tokens": 1762271492.0, + "step": 3447 + }, + { + "epoch": 0.9323958896700919, + "grad_norm": 3.28125, + "learning_rate": 0.01886968711387761, + "loss": 3.5347, + "mean_token_accuracy": 0.36522188782691956, + "num_tokens": 1762795729.0, + "step": 3448 + }, + { + "epoch": 0.9326663061114115, + "grad_norm": 2.84375, + "learning_rate": 0.01886892209898921, + "loss": 3.288, + "mean_token_accuracy": 0.3875582814216614, + "num_tokens": 1763319962.0, + "step": 3449 + }, + { + "epoch": 0.9329367225527312, + "grad_norm": 2.953125, + "learning_rate": 0.01886815684265913, + "loss": 3.4009, + "mean_token_accuracy": 0.3730137348175049, + "num_tokens": 1763789564.0, + "step": 3450 + }, + { + "epoch": 0.9332071389940508, + "grad_norm": 92.0, + "learning_rate": 0.018867391344910868, + "loss": 11.9725, + "mean_token_accuracy": 0.004233930259943008, + "num_tokens": 1764313728.0, + "step": 3451 + }, + { + "epoch": 0.9334775554353705, + "grad_norm": 8.1875, + "learning_rate": 0.0188666256057679, + "loss": 3.9701, + "mean_token_accuracy": 0.30982547998428345, + "num_tokens": 1764816608.0, + "step": 3452 + }, + { + "epoch": 0.9337479718766901, + "grad_norm": 2.5, + "learning_rate": 0.018865859625253723, + "loss": 3.4552, + "mean_token_accuracy": 0.34264931082725525, + "num_tokens": 1765340695.0, + "step": 3453 + }, + { + "epoch": 0.9340183883180098, + "grad_norm": 2.828125, + "learning_rate": 0.01886509340339184, + "loss": 3.5612, + "mean_token_accuracy": 0.36629870533943176, + "num_tokens": 1765864858.0, + "step": 3454 + }, + { + "epoch": 0.9342888047593294, + "grad_norm": 2.890625, + "learning_rate": 0.018864326940205765, + "loss": 3.4228, + "mean_token_accuracy": 0.37499022483825684, + "num_tokens": 1766387853.0, + "step": 3455 + }, + { + "epoch": 0.934559221200649, + "grad_norm": 3.109375, + "learning_rate": 0.01886356023571901, + "loss": 3.542, + "mean_token_accuracy": 0.37009215354919434, + "num_tokens": 1766891906.0, + "step": 3456 + }, + { + "epoch": 0.9348296376419686, + "grad_norm": 2.828125, + "learning_rate": 0.0188627932899551, + "loss": 3.4706, + "mean_token_accuracy": 0.36513859033584595, + "num_tokens": 1767393127.0, + "step": 3457 + }, + { + "epoch": 0.9351000540832882, + "grad_norm": 3.953125, + "learning_rate": 0.018862026102937572, + "loss": 3.637, + "mean_token_accuracy": 0.3476811945438385, + "num_tokens": 1767917323.0, + "step": 3458 + }, + { + "epoch": 0.9353704705246079, + "grad_norm": 3.171875, + "learning_rate": 0.01886125867468996, + "loss": 3.2804, + "mean_token_accuracy": 0.38046252727508545, + "num_tokens": 1768404863.0, + "step": 3459 + }, + { + "epoch": 0.9356408869659275, + "grad_norm": 2.8125, + "learning_rate": 0.018860491005235815, + "loss": 3.1897, + "mean_token_accuracy": 0.3948870897293091, + "num_tokens": 1768893762.0, + "step": 3460 + }, + { + "epoch": 0.9359113034072472, + "grad_norm": 3.28125, + "learning_rate": 0.01885972309459869, + "loss": 3.357, + "mean_token_accuracy": 0.3623427152633667, + "num_tokens": 1769417889.0, + "step": 3461 + }, + { + "epoch": 0.9361817198485668, + "grad_norm": 3.875, + "learning_rate": 0.018858954942802146, + "loss": 3.3625, + "mean_token_accuracy": 0.39908474683761597, + "num_tokens": 1769931388.0, + "step": 3462 + }, + { + "epoch": 0.9364521362898864, + "grad_norm": 2.640625, + "learning_rate": 0.018858186549869753, + "loss": 3.2787, + "mean_token_accuracy": 0.39656156301498413, + "num_tokens": 1770455639.0, + "step": 3463 + }, + { + "epoch": 0.9367225527312061, + "grad_norm": 5.28125, + "learning_rate": 0.018857417915825087, + "loss": 3.5443, + "mean_token_accuracy": 0.3766823410987854, + "num_tokens": 1770920626.0, + "step": 3464 + }, + { + "epoch": 0.9369929691725257, + "grad_norm": 3.0, + "learning_rate": 0.01885664904069173, + "loss": 3.3153, + "mean_token_accuracy": 0.38317275047302246, + "num_tokens": 1771444891.0, + "step": 3465 + }, + { + "epoch": 0.9372633856138454, + "grad_norm": 4.40625, + "learning_rate": 0.018855879924493276, + "loss": 3.6671, + "mean_token_accuracy": 0.3260030746459961, + "num_tokens": 1771969138.0, + "step": 3466 + }, + { + "epoch": 0.9375338020551649, + "grad_norm": 2.390625, + "learning_rate": 0.01885511056725333, + "loss": 3.3889, + "mean_token_accuracy": 0.36243587732315063, + "num_tokens": 1772493293.0, + "step": 3467 + }, + { + "epoch": 0.9378042184964845, + "grad_norm": 3.0, + "learning_rate": 0.018854340968995485, + "loss": 3.5086, + "mean_token_accuracy": 0.3801506459712982, + "num_tokens": 1773017527.0, + "step": 3468 + }, + { + "epoch": 0.9380746349378042, + "grad_norm": 2.484375, + "learning_rate": 0.018853571129743365, + "loss": 3.4071, + "mean_token_accuracy": 0.37388864159584045, + "num_tokens": 1773496308.0, + "step": 3469 + }, + { + "epoch": 0.9383450513791238, + "grad_norm": 2.53125, + "learning_rate": 0.01885280104952058, + "loss": 3.3267, + "mean_token_accuracy": 0.3835909962654114, + "num_tokens": 1774020561.0, + "step": 3470 + }, + { + "epoch": 0.9386154678204435, + "grad_norm": 2.765625, + "learning_rate": 0.018852030728350767, + "loss": 10.0252, + "mean_token_accuracy": 0.0, + "num_tokens": 1774544530.0, + "step": 3471 + }, + { + "epoch": 0.9388858842617631, + "grad_norm": 8.5, + "learning_rate": 0.01885126016625757, + "loss": 4.1612, + "mean_token_accuracy": 0.2862635850906372, + "num_tokens": 1775068761.0, + "step": 3472 + }, + { + "epoch": 0.9391563007030828, + "grad_norm": 2.640625, + "learning_rate": 0.018850489363264607, + "loss": 3.4447, + "mean_token_accuracy": 0.3505575358867645, + "num_tokens": 1775592989.0, + "step": 3473 + }, + { + "epoch": 0.9394267171444024, + "grad_norm": 2.71875, + "learning_rate": 0.018849718319395553, + "loss": 3.2847, + "mean_token_accuracy": 0.36686891317367554, + "num_tokens": 1776117234.0, + "step": 3474 + }, + { + "epoch": 0.939697133585722, + "grad_norm": 3.125, + "learning_rate": 0.018848947034674057, + "loss": 3.5315, + "mean_token_accuracy": 0.3605479300022125, + "num_tokens": 1776641440.0, + "step": 3475 + }, + { + "epoch": 0.9399675500270417, + "grad_norm": 4.375, + "learning_rate": 0.01884817550912378, + "loss": 3.5009, + "mean_token_accuracy": 0.3641109764575958, + "num_tokens": 1777072917.0, + "step": 3476 + }, + { + "epoch": 0.9402379664683613, + "grad_norm": 3.1875, + "learning_rate": 0.018847403742768398, + "loss": 3.4223, + "mean_token_accuracy": 0.3612322211265564, + "num_tokens": 1777597186.0, + "step": 3477 + }, + { + "epoch": 0.9405083829096809, + "grad_norm": 3.0625, + "learning_rate": 0.018846631735631593, + "loss": 3.5012, + "mean_token_accuracy": 0.37522703409194946, + "num_tokens": 1778105395.0, + "step": 3478 + }, + { + "epoch": 0.9407787993510005, + "grad_norm": 3.109375, + "learning_rate": 0.01884585948773705, + "loss": 3.4809, + "mean_token_accuracy": 0.36102497577667236, + "num_tokens": 1778629587.0, + "step": 3479 + }, + { + "epoch": 0.9410492157923201, + "grad_norm": 2.9375, + "learning_rate": 0.018845086999108467, + "loss": 3.4105, + "mean_token_accuracy": 0.3572997748851776, + "num_tokens": 1779153805.0, + "step": 3480 + }, + { + "epoch": 0.9413196322336398, + "grad_norm": 2.921875, + "learning_rate": 0.01884431426976954, + "loss": 3.3485, + "mean_token_accuracy": 0.34001094102859497, + "num_tokens": 1779678033.0, + "step": 3481 + }, + { + "epoch": 0.9415900486749594, + "grad_norm": 2.28125, + "learning_rate": 0.01884354129974399, + "loss": 3.3946, + "mean_token_accuracy": 0.37024834752082825, + "num_tokens": 1780154827.0, + "step": 3482 + }, + { + "epoch": 0.9418604651162791, + "grad_norm": 3.421875, + "learning_rate": 0.01884276808905552, + "loss": 3.5915, + "mean_token_accuracy": 0.3610553741455078, + "num_tokens": 1780679110.0, + "step": 3483 + }, + { + "epoch": 0.9421308815575987, + "grad_norm": 2.5, + "learning_rate": 0.01884199463772786, + "loss": 3.3925, + "mean_token_accuracy": 0.3574356436729431, + "num_tokens": 1781203317.0, + "step": 3484 + }, + { + "epoch": 0.9424012979989184, + "grad_norm": 2.390625, + "learning_rate": 0.018841220945784748, + "loss": 3.2765, + "mean_token_accuracy": 0.39747536182403564, + "num_tokens": 1781702304.0, + "step": 3485 + }, + { + "epoch": 0.942671714440238, + "grad_norm": 2.765625, + "learning_rate": 0.018840447013249915, + "loss": 3.0571, + "mean_token_accuracy": 0.3966521620750427, + "num_tokens": 1782226370.0, + "step": 3486 + }, + { + "epoch": 0.9429421308815576, + "grad_norm": 2.515625, + "learning_rate": 0.01883967284014711, + "loss": 3.4222, + "mean_token_accuracy": 0.34770047664642334, + "num_tokens": 1782750600.0, + "step": 3487 + }, + { + "epoch": 0.9432125473228772, + "grad_norm": 3.03125, + "learning_rate": 0.01883889842650009, + "loss": 3.2555, + "mean_token_accuracy": 0.3765011727809906, + "num_tokens": 1783274703.0, + "step": 3488 + }, + { + "epoch": 0.9434829637641968, + "grad_norm": 2.859375, + "learning_rate": 0.01883812377233261, + "loss": 3.221, + "mean_token_accuracy": 0.39725345373153687, + "num_tokens": 1783798854.0, + "step": 3489 + }, + { + "epoch": 0.9437533802055165, + "grad_norm": 3.296875, + "learning_rate": 0.01883734887766844, + "loss": 3.6112, + "mean_token_accuracy": 0.3287675082683563, + "num_tokens": 1784323125.0, + "step": 3490 + }, + { + "epoch": 0.9440237966468361, + "grad_norm": 5.03125, + "learning_rate": 0.018836573742531365, + "loss": 10.3591, + "mean_token_accuracy": 5.027753104513977e-06, + "num_tokens": 1784847195.0, + "step": 3491 + }, + { + "epoch": 0.9442942130881558, + "grad_norm": 8.875, + "learning_rate": 0.01883579836694516, + "loss": 4.0755, + "mean_token_accuracy": 0.34290778636932373, + "num_tokens": 1785273674.0, + "step": 3492 + }, + { + "epoch": 0.9445646295294754, + "grad_norm": 2.046875, + "learning_rate": 0.018835022750933614, + "loss": 3.5295, + "mean_token_accuracy": 0.35655415058135986, + "num_tokens": 1785797939.0, + "step": 3493 + }, + { + "epoch": 0.944835045970795, + "grad_norm": 2.125, + "learning_rate": 0.018834246894520533, + "loss": 3.1863, + "mean_token_accuracy": 0.3712095022201538, + "num_tokens": 1786322022.0, + "step": 3494 + }, + { + "epoch": 0.9451054624121147, + "grad_norm": 3.265625, + "learning_rate": 0.018833470797729714, + "loss": 3.3654, + "mean_token_accuracy": 0.374281108379364, + "num_tokens": 1786809571.0, + "step": 3495 + }, + { + "epoch": 0.9453758788534343, + "grad_norm": 3.546875, + "learning_rate": 0.018832694460584977, + "loss": 3.399, + "mean_token_accuracy": 0.3762032687664032, + "num_tokens": 1787333850.0, + "step": 3496 + }, + { + "epoch": 0.945646295294754, + "grad_norm": 3.25, + "learning_rate": 0.01883191788311014, + "loss": 3.2935, + "mean_token_accuracy": 0.3675526976585388, + "num_tokens": 1787858126.0, + "step": 3497 + }, + { + "epoch": 0.9459167117360735, + "grad_norm": 3.078125, + "learning_rate": 0.01883114106532903, + "loss": 3.289, + "mean_token_accuracy": 0.39988481998443604, + "num_tokens": 1788382294.0, + "step": 3498 + }, + { + "epoch": 0.9461871281773931, + "grad_norm": 3.140625, + "learning_rate": 0.01883036400726548, + "loss": 3.3153, + "mean_token_accuracy": 0.36127030849456787, + "num_tokens": 1788906475.0, + "step": 3499 + }, + { + "epoch": 0.9464575446187128, + "grad_norm": 3.078125, + "learning_rate": 0.018829586708943337, + "loss": 3.0838, + "mean_token_accuracy": 0.391484797000885, + "num_tokens": 1789422679.0, + "step": 3500 + }, + { + "epoch": 0.9467279610600324, + "grad_norm": 3.015625, + "learning_rate": 0.018828809170386446, + "loss": 3.4987, + "mean_token_accuracy": 0.35044920444488525, + "num_tokens": 1789946791.0, + "step": 3501 + }, + { + "epoch": 0.9469983775013521, + "grad_norm": 3.03125, + "learning_rate": 0.01882803139161867, + "loss": 3.3923, + "mean_token_accuracy": 0.35117632150650024, + "num_tokens": 1790443399.0, + "step": 3502 + }, + { + "epoch": 0.9472687939426717, + "grad_norm": 2.5625, + "learning_rate": 0.018827253372663867, + "loss": 3.4017, + "mean_token_accuracy": 0.37638550996780396, + "num_tokens": 1790949170.0, + "step": 3503 + }, + { + "epoch": 0.9475392103839914, + "grad_norm": 3.65625, + "learning_rate": 0.018826475113545914, + "loss": 3.5367, + "mean_token_accuracy": 0.3504851460456848, + "num_tokens": 1791473271.0, + "step": 3504 + }, + { + "epoch": 0.947809626825311, + "grad_norm": 2.921875, + "learning_rate": 0.018825696614288685, + "loss": 3.3288, + "mean_token_accuracy": 0.3877270221710205, + "num_tokens": 1791997400.0, + "step": 3505 + }, + { + "epoch": 0.9480800432666306, + "grad_norm": 3.625, + "learning_rate": 0.018824917874916074, + "loss": 3.4585, + "mean_token_accuracy": 0.36972561478614807, + "num_tokens": 1792521604.0, + "step": 3506 + }, + { + "epoch": 0.9483504597079503, + "grad_norm": 3.046875, + "learning_rate": 0.01882413889545197, + "loss": 3.4303, + "mean_token_accuracy": 0.3991028368473053, + "num_tokens": 1792995898.0, + "step": 3507 + }, + { + "epoch": 0.9486208761492698, + "grad_norm": 2.828125, + "learning_rate": 0.018823359675920276, + "loss": 3.2692, + "mean_token_accuracy": 0.35497739911079407, + "num_tokens": 1793520055.0, + "step": 3508 + }, + { + "epoch": 0.9488912925905895, + "grad_norm": 24.375, + "learning_rate": 0.0188225802163449, + "loss": 3.2517, + "mean_token_accuracy": 0.374221533536911, + "num_tokens": 1794008315.0, + "step": 3509 + }, + { + "epoch": 0.9491617090319091, + "grad_norm": 3.5625, + "learning_rate": 0.018821800516749754, + "loss": 3.5781, + "mean_token_accuracy": 0.34757256507873535, + "num_tokens": 1794532582.0, + "step": 3510 + }, + { + "epoch": 0.9494321254732287, + "grad_norm": 30.75, + "learning_rate": 0.018821020577158773, + "loss": 22.2977, + "mean_token_accuracy": 0.03346799314022064, + "num_tokens": 1794995539.0, + "step": 3511 + }, + { + "epoch": 0.9497025419145484, + "grad_norm": 14.25, + "learning_rate": 0.018820240397595875, + "loss": 4.0898, + "mean_token_accuracy": 0.30381113290786743, + "num_tokens": 1795519736.0, + "step": 3512 + }, + { + "epoch": 0.949972958355868, + "grad_norm": 3.515625, + "learning_rate": 0.018819459978085006, + "loss": 3.648, + "mean_token_accuracy": 0.3650180399417877, + "num_tokens": 1795981428.0, + "step": 3513 + }, + { + "epoch": 0.9502433747971877, + "grad_norm": 2.9375, + "learning_rate": 0.018818679318650108, + "loss": 3.7019, + "mean_token_accuracy": 0.35947033762931824, + "num_tokens": 1796407699.0, + "step": 3514 + }, + { + "epoch": 0.9505137912385073, + "grad_norm": 4.21875, + "learning_rate": 0.018817898419315134, + "loss": 3.4958, + "mean_token_accuracy": 0.3518305718898773, + "num_tokens": 1796931879.0, + "step": 3515 + }, + { + "epoch": 0.950784207679827, + "grad_norm": 4.0625, + "learning_rate": 0.01881711728010405, + "loss": 3.5404, + "mean_token_accuracy": 0.37573492527008057, + "num_tokens": 1797392470.0, + "step": 3516 + }, + { + "epoch": 0.9510546241211466, + "grad_norm": 3.140625, + "learning_rate": 0.018816335901040815, + "loss": 3.7143, + "mean_token_accuracy": 0.34096667170524597, + "num_tokens": 1797869297.0, + "step": 3517 + }, + { + "epoch": 0.9513250405624663, + "grad_norm": 2.796875, + "learning_rate": 0.018815554282149413, + "loss": 3.7134, + "mean_token_accuracy": 0.3394208252429962, + "num_tokens": 1798393539.0, + "step": 3518 + }, + { + "epoch": 0.9515954570037858, + "grad_norm": 2.46875, + "learning_rate": 0.018814772423453814, + "loss": 3.3433, + "mean_token_accuracy": 0.37536680698394775, + "num_tokens": 1798870733.0, + "step": 3519 + }, + { + "epoch": 0.9518658734451054, + "grad_norm": 2.9375, + "learning_rate": 0.018813990324978023, + "loss": 3.2188, + "mean_token_accuracy": 0.3804926872253418, + "num_tokens": 1799395018.0, + "step": 3520 + }, + { + "epoch": 0.9521362898864251, + "grad_norm": 2.640625, + "learning_rate": 0.018813207986746027, + "loss": 3.4816, + "mean_token_accuracy": 0.3760513961315155, + "num_tokens": 1799919190.0, + "step": 3521 + }, + { + "epoch": 0.9524067063277447, + "grad_norm": 2.671875, + "learning_rate": 0.018812425408781832, + "loss": 3.1589, + "mean_token_accuracy": 0.3842635750770569, + "num_tokens": 1800443472.0, + "step": 3522 + }, + { + "epoch": 0.9526771227690644, + "grad_norm": 3.125, + "learning_rate": 0.01881164259110945, + "loss": 3.2712, + "mean_token_accuracy": 0.3770638704299927, + "num_tokens": 1800967697.0, + "step": 3523 + }, + { + "epoch": 0.952947539210384, + "grad_norm": 3.25, + "learning_rate": 0.0188108595337529, + "loss": 3.4737, + "mean_token_accuracy": 0.37607496976852417, + "num_tokens": 1801491769.0, + "step": 3524 + }, + { + "epoch": 0.9532179556517036, + "grad_norm": 3.359375, + "learning_rate": 0.018810076236736212, + "loss": 3.2893, + "mean_token_accuracy": 0.3653382658958435, + "num_tokens": 1802015966.0, + "step": 3525 + }, + { + "epoch": 0.9534883720930233, + "grad_norm": 2.90625, + "learning_rate": 0.018809292700083417, + "loss": 3.3477, + "mean_token_accuracy": 0.3781247138977051, + "num_tokens": 1802518203.0, + "step": 3526 + }, + { + "epoch": 0.9537587885343429, + "grad_norm": 2.71875, + "learning_rate": 0.018808508923818554, + "loss": 3.3159, + "mean_token_accuracy": 0.37771695852279663, + "num_tokens": 1803042465.0, + "step": 3527 + }, + { + "epoch": 0.9540292049756626, + "grad_norm": 2.65625, + "learning_rate": 0.018807724907965676, + "loss": 3.331, + "mean_token_accuracy": 0.36111289262771606, + "num_tokens": 1803566695.0, + "step": 3528 + }, + { + "epoch": 0.9542996214169821, + "grad_norm": 3.0, + "learning_rate": 0.018806940652548836, + "loss": 3.4503, + "mean_token_accuracy": 0.3797306716442108, + "num_tokens": 1804090763.0, + "step": 3529 + }, + { + "epoch": 0.9545700378583017, + "grad_norm": 2.828125, + "learning_rate": 0.018806156157592097, + "loss": 3.1965, + "mean_token_accuracy": 0.39784353971481323, + "num_tokens": 1804615012.0, + "step": 3530 + }, + { + "epoch": 0.9548404542996214, + "grad_norm": 4.1875, + "learning_rate": 0.018805371423119534, + "loss": 11.0175, + "mean_token_accuracy": 4.483136581256986e-05, + "num_tokens": 1805139229.0, + "step": 3531 + }, + { + "epoch": 0.955110870740941, + "grad_norm": 10.4375, + "learning_rate": 0.018804586449155218, + "loss": 4.2195, + "mean_token_accuracy": 0.25582072138786316, + "num_tokens": 1805663501.0, + "step": 3532 + }, + { + "epoch": 0.9553812871822607, + "grad_norm": 22.0, + "learning_rate": 0.018803801235723238, + "loss": 3.288, + "mean_token_accuracy": 0.3897334039211273, + "num_tokens": 1806187781.0, + "step": 3533 + }, + { + "epoch": 0.9556517036235803, + "grad_norm": 3.265625, + "learning_rate": 0.01880301578284769, + "loss": 3.6443, + "mean_token_accuracy": 0.35536834597587585, + "num_tokens": 1806704534.0, + "step": 3534 + }, + { + "epoch": 0.9559221200649, + "grad_norm": 3.46875, + "learning_rate": 0.018802230090552666, + "loss": 3.6937, + "mean_token_accuracy": 0.3452375531196594, + "num_tokens": 1807228768.0, + "step": 3535 + }, + { + "epoch": 0.9561925365062196, + "grad_norm": 4.84375, + "learning_rate": 0.01880144415886228, + "loss": 3.7042, + "mean_token_accuracy": 0.3552151918411255, + "num_tokens": 1807718315.0, + "step": 3536 + }, + { + "epoch": 0.9564629529475392, + "grad_norm": 3.6875, + "learning_rate": 0.018800657987800645, + "loss": 3.3529, + "mean_token_accuracy": 0.36405056715011597, + "num_tokens": 1808188294.0, + "step": 3537 + }, + { + "epoch": 0.9567333693888589, + "grad_norm": 3.46875, + "learning_rate": 0.018799871577391884, + "loss": 3.621, + "mean_token_accuracy": 0.3571566045284271, + "num_tokens": 1808656973.0, + "step": 3538 + }, + { + "epoch": 0.9570037858301784, + "grad_norm": 3.984375, + "learning_rate": 0.018799084927660123, + "loss": 3.4433, + "mean_token_accuracy": 0.3546083867549896, + "num_tokens": 1809181171.0, + "step": 3539 + }, + { + "epoch": 0.9572742022714981, + "grad_norm": 2.953125, + "learning_rate": 0.018798298038629504, + "loss": 3.6629, + "mean_token_accuracy": 0.3253205418586731, + "num_tokens": 1809705440.0, + "step": 3540 + }, + { + "epoch": 0.9575446187128177, + "grad_norm": 5.03125, + "learning_rate": 0.018797510910324163, + "loss": 3.669, + "mean_token_accuracy": 0.3437517583370209, + "num_tokens": 1810229667.0, + "step": 3541 + }, + { + "epoch": 0.9578150351541374, + "grad_norm": 3.03125, + "learning_rate": 0.01879672354276826, + "loss": 3.4877, + "mean_token_accuracy": 0.3690038323402405, + "num_tokens": 1810753879.0, + "step": 3542 + }, + { + "epoch": 0.958085451595457, + "grad_norm": 2.890625, + "learning_rate": 0.018795935935985948, + "loss": 3.2991, + "mean_token_accuracy": 0.37214046716690063, + "num_tokens": 1811278104.0, + "step": 3543 + }, + { + "epoch": 0.9583558680367766, + "grad_norm": 2.453125, + "learning_rate": 0.018795148090001396, + "loss": 3.3839, + "mean_token_accuracy": 0.35334479808807373, + "num_tokens": 1811802338.0, + "step": 3544 + }, + { + "epoch": 0.9586262844780963, + "grad_norm": 3.3125, + "learning_rate": 0.018794360004838778, + "loss": 3.5681, + "mean_token_accuracy": 0.3628767132759094, + "num_tokens": 1812326624.0, + "step": 3545 + }, + { + "epoch": 0.9588967009194159, + "grad_norm": 2.34375, + "learning_rate": 0.018793571680522272, + "loss": 3.3966, + "mean_token_accuracy": 0.3868790864944458, + "num_tokens": 1812822234.0, + "step": 3546 + }, + { + "epoch": 0.9591671173607356, + "grad_norm": 2.328125, + "learning_rate": 0.018792783117076068, + "loss": 3.2331, + "mean_token_accuracy": 0.38198310136795044, + "num_tokens": 1813346433.0, + "step": 3547 + }, + { + "epoch": 0.9594375338020552, + "grad_norm": 2.3125, + "learning_rate": 0.018791994314524357, + "loss": 3.3196, + "mean_token_accuracy": 0.3929380178451538, + "num_tokens": 1813870614.0, + "step": 3548 + }, + { + "epoch": 0.9597079502433749, + "grad_norm": 3.203125, + "learning_rate": 0.01879120527289135, + "loss": 3.3063, + "mean_token_accuracy": 0.3880481719970703, + "num_tokens": 1814394825.0, + "step": 3549 + }, + { + "epoch": 0.9599783666846944, + "grad_norm": 2.640625, + "learning_rate": 0.018790415992201253, + "loss": 3.2474, + "mean_token_accuracy": 0.35600146651268005, + "num_tokens": 1814918936.0, + "step": 3550 + }, + { + "epoch": 0.960248783126014, + "grad_norm": 72.5, + "learning_rate": 0.018789626472478282, + "loss": 21.2931, + "mean_token_accuracy": 0.012969061732292175, + "num_tokens": 1815412870.0, + "step": 3551 + }, + { + "epoch": 0.9605191995673337, + "grad_norm": 8.6875, + "learning_rate": 0.018788836713746662, + "loss": 4.0014, + "mean_token_accuracy": 0.33760541677474976, + "num_tokens": 1815920950.0, + "step": 3552 + }, + { + "epoch": 0.9607896160086533, + "grad_norm": 4.3125, + "learning_rate": 0.018788046716030626, + "loss": 3.7002, + "mean_token_accuracy": 0.34325969219207764, + "num_tokens": 1816445172.0, + "step": 3553 + }, + { + "epoch": 0.961060032449973, + "grad_norm": 3.296875, + "learning_rate": 0.018787256479354413, + "loss": 3.6078, + "mean_token_accuracy": 0.35123342275619507, + "num_tokens": 1816969308.0, + "step": 3554 + }, + { + "epoch": 0.9613304488912926, + "grad_norm": 3.515625, + "learning_rate": 0.018786466003742276, + "loss": 3.6616, + "mean_token_accuracy": 0.3446655571460724, + "num_tokens": 1817493584.0, + "step": 3555 + }, + { + "epoch": 0.9616008653326122, + "grad_norm": 3.203125, + "learning_rate": 0.018785675289218456, + "loss": 3.4473, + "mean_token_accuracy": 0.3583812117576599, + "num_tokens": 1817980137.0, + "step": 3556 + }, + { + "epoch": 0.9618712817739319, + "grad_norm": 2.671875, + "learning_rate": 0.018784884335807227, + "loss": 3.3175, + "mean_token_accuracy": 0.376011461019516, + "num_tokens": 1818504217.0, + "step": 3557 + }, + { + "epoch": 0.9621416982152515, + "grad_norm": 2.390625, + "learning_rate": 0.01878409314353285, + "loss": 3.2971, + "mean_token_accuracy": 0.37959951162338257, + "num_tokens": 1819028475.0, + "step": 3558 + }, + { + "epoch": 0.9624121146565712, + "grad_norm": 2.6875, + "learning_rate": 0.018783301712419605, + "loss": 3.1622, + "mean_token_accuracy": 0.3741884231567383, + "num_tokens": 1819552704.0, + "step": 3559 + }, + { + "epoch": 0.9626825310978907, + "grad_norm": 7.8125, + "learning_rate": 0.018782510042491776, + "loss": 3.1785, + "mean_token_accuracy": 0.39059802889823914, + "num_tokens": 1820076778.0, + "step": 3560 + }, + { + "epoch": 0.9629529475392103, + "grad_norm": 6.65625, + "learning_rate": 0.01878171813377365, + "loss": 3.4628, + "mean_token_accuracy": 0.37320879101753235, + "num_tokens": 1820579670.0, + "step": 3561 + }, + { + "epoch": 0.96322336398053, + "grad_norm": 1.5, + "learning_rate": 0.018780925986289526, + "loss": 3.4258, + "mean_token_accuracy": 0.3663787245750427, + "num_tokens": 1821103905.0, + "step": 3562 + }, + { + "epoch": 0.9634937804218496, + "grad_norm": 2.78125, + "learning_rate": 0.018780133600063715, + "loss": 3.3251, + "mean_token_accuracy": 0.37383025884628296, + "num_tokens": 1821628133.0, + "step": 3563 + }, + { + "epoch": 0.9637641968631693, + "grad_norm": 2.90625, + "learning_rate": 0.01877934097512052, + "loss": 3.242, + "mean_token_accuracy": 0.39259830117225647, + "num_tokens": 1822152356.0, + "step": 3564 + }, + { + "epoch": 0.9640346133044889, + "grad_norm": 5.6875, + "learning_rate": 0.01877854811148427, + "loss": 3.1853, + "mean_token_accuracy": 0.3892413377761841, + "num_tokens": 1822641629.0, + "step": 3565 + }, + { + "epoch": 0.9643050297458086, + "grad_norm": 2.078125, + "learning_rate": 0.018777755009179286, + "loss": 3.5464, + "mean_token_accuracy": 0.34327322244644165, + "num_tokens": 1823165694.0, + "step": 3566 + }, + { + "epoch": 0.9645754461871282, + "grad_norm": 2.390625, + "learning_rate": 0.018776961668229907, + "loss": 3.2534, + "mean_token_accuracy": 0.38719409704208374, + "num_tokens": 1823689884.0, + "step": 3567 + }, + { + "epoch": 0.9648458626284478, + "grad_norm": 2.9375, + "learning_rate": 0.018776168088660474, + "loss": 3.3426, + "mean_token_accuracy": 0.38575878739356995, + "num_tokens": 1824214163.0, + "step": 3568 + }, + { + "epoch": 0.9651162790697675, + "grad_norm": 3.4375, + "learning_rate": 0.018775374270495332, + "loss": 3.5696, + "mean_token_accuracy": 0.36755838990211487, + "num_tokens": 1824738413.0, + "step": 3569 + }, + { + "epoch": 0.965386695511087, + "grad_norm": 3.375, + "learning_rate": 0.018774580213758846, + "loss": 3.5938, + "mean_token_accuracy": 0.3469699025154114, + "num_tokens": 1825262688.0, + "step": 3570 + }, + { + "epoch": 0.9656571119524067, + "grad_norm": 42.25, + "learning_rate": 0.018773785918475373, + "loss": 11.4185, + "mean_token_accuracy": 0.004162908066064119, + "num_tokens": 1825769950.0, + "step": 3571 + }, + { + "epoch": 0.9659275283937263, + "grad_norm": 6.25, + "learning_rate": 0.018772991384669292, + "loss": 3.7672, + "mean_token_accuracy": 0.3276151120662689, + "num_tokens": 1826294090.0, + "step": 3572 + }, + { + "epoch": 0.966197944835046, + "grad_norm": 2.46875, + "learning_rate": 0.01877219661236497, + "loss": 3.5936, + "mean_token_accuracy": 0.3578561842441559, + "num_tokens": 1826818353.0, + "step": 3573 + }, + { + "epoch": 0.9664683612763656, + "grad_norm": 4.34375, + "learning_rate": 0.018771401601586798, + "loss": 3.6288, + "mean_token_accuracy": 0.37191352248191833, + "num_tokens": 1827240180.0, + "step": 3574 + }, + { + "epoch": 0.9667387777176852, + "grad_norm": 2.703125, + "learning_rate": 0.01877060635235918, + "loss": 3.4205, + "mean_token_accuracy": 0.38788706064224243, + "num_tokens": 1827704230.0, + "step": 3575 + }, + { + "epoch": 0.9670091941590049, + "grad_norm": 3.265625, + "learning_rate": 0.0187698108647065, + "loss": 3.3914, + "mean_token_accuracy": 0.3750514090061188, + "num_tokens": 1828228490.0, + "step": 3576 + }, + { + "epoch": 0.9672796106003245, + "grad_norm": 2.953125, + "learning_rate": 0.018769015138653172, + "loss": 3.3665, + "mean_token_accuracy": 0.37245020270347595, + "num_tokens": 1828752765.0, + "step": 3577 + }, + { + "epoch": 0.9675500270416442, + "grad_norm": 2.546875, + "learning_rate": 0.018768219174223617, + "loss": 3.4066, + "mean_token_accuracy": 0.38487738370895386, + "num_tokens": 1829254106.0, + "step": 3578 + }, + { + "epoch": 0.9678204434829638, + "grad_norm": 3.34375, + "learning_rate": 0.018767422971442252, + "loss": 3.5347, + "mean_token_accuracy": 0.35473084449768066, + "num_tokens": 1829729704.0, + "step": 3579 + }, + { + "epoch": 0.9680908599242833, + "grad_norm": 2.421875, + "learning_rate": 0.018766626530333504, + "loss": 3.2446, + "mean_token_accuracy": 0.38559216260910034, + "num_tokens": 1830242295.0, + "step": 3580 + }, + { + "epoch": 0.968361276365603, + "grad_norm": 3.609375, + "learning_rate": 0.018765829850921817, + "loss": 3.266, + "mean_token_accuracy": 0.3746352791786194, + "num_tokens": 1830766344.0, + "step": 3581 + }, + { + "epoch": 0.9686316928069226, + "grad_norm": 3.28125, + "learning_rate": 0.018765032933231635, + "loss": 3.3525, + "mean_token_accuracy": 0.3778958320617676, + "num_tokens": 1831290549.0, + "step": 3582 + }, + { + "epoch": 0.9689021092482423, + "grad_norm": 3.015625, + "learning_rate": 0.018764235777287402, + "loss": 3.2239, + "mean_token_accuracy": 0.3951757848262787, + "num_tokens": 1831814769.0, + "step": 3583 + }, + { + "epoch": 0.9691725256895619, + "grad_norm": 3.75, + "learning_rate": 0.018763438383113586, + "loss": 3.343, + "mean_token_accuracy": 0.37153568863868713, + "num_tokens": 1832339044.0, + "step": 3584 + }, + { + "epoch": 0.9694429421308816, + "grad_norm": 3.59375, + "learning_rate": 0.018762640750734645, + "loss": 3.386, + "mean_token_accuracy": 0.37193742394447327, + "num_tokens": 1832863298.0, + "step": 3585 + }, + { + "epoch": 0.9697133585722012, + "grad_norm": 3.171875, + "learning_rate": 0.018761842880175062, + "loss": 3.3769, + "mean_token_accuracy": 0.37103211879730225, + "num_tokens": 1833354672.0, + "step": 3586 + }, + { + "epoch": 0.9699837750135208, + "grad_norm": 2.9375, + "learning_rate": 0.01876104477145931, + "loss": 3.0611, + "mean_token_accuracy": 0.42334839701652527, + "num_tokens": 1833836610.0, + "step": 3587 + }, + { + "epoch": 0.9702541914548405, + "grad_norm": 2.5625, + "learning_rate": 0.01876024642461188, + "loss": 3.5241, + "mean_token_accuracy": 0.35814571380615234, + "num_tokens": 1834360769.0, + "step": 3588 + }, + { + "epoch": 0.9705246078961601, + "grad_norm": 4.75, + "learning_rate": 0.018759447839657273, + "loss": 3.6745, + "mean_token_accuracy": 0.33640193939208984, + "num_tokens": 1834884951.0, + "step": 3589 + }, + { + "epoch": 0.9707950243374798, + "grad_norm": 2.625, + "learning_rate": 0.018758649016619985, + "loss": 3.2523, + "mean_token_accuracy": 0.37308457493782043, + "num_tokens": 1835408962.0, + "step": 3590 + }, + { + "epoch": 0.9710654407787993, + "grad_norm": 24.0, + "learning_rate": 0.018757849955524528, + "loss": 11.1797, + "mean_token_accuracy": 0.009683476760983467, + "num_tokens": 1835904003.0, + "step": 3591 + }, + { + "epoch": 0.971335857220119, + "grad_norm": 8.8125, + "learning_rate": 0.01875705065639542, + "loss": 3.9811, + "mean_token_accuracy": 0.3054425120353699, + "num_tokens": 1836372076.0, + "step": 3592 + }, + { + "epoch": 0.9716062736614386, + "grad_norm": 2.546875, + "learning_rate": 0.018756251119257186, + "loss": 3.5551, + "mean_token_accuracy": 0.3600265383720398, + "num_tokens": 1836896266.0, + "step": 3593 + }, + { + "epoch": 0.9718766901027582, + "grad_norm": 2.5, + "learning_rate": 0.01875545134413436, + "loss": 3.4362, + "mean_token_accuracy": 0.35449689626693726, + "num_tokens": 1837420430.0, + "step": 3594 + }, + { + "epoch": 0.9721471065440779, + "grad_norm": 3.375, + "learning_rate": 0.018754651331051475, + "loss": 3.1596, + "mean_token_accuracy": 0.3969004452228546, + "num_tokens": 1837944611.0, + "step": 3595 + }, + { + "epoch": 0.9724175229853975, + "grad_norm": 2.484375, + "learning_rate": 0.018753851080033083, + "loss": 3.2772, + "mean_token_accuracy": 0.37971967458724976, + "num_tokens": 1838468756.0, + "step": 3596 + }, + { + "epoch": 0.9726879394267172, + "grad_norm": 3.03125, + "learning_rate": 0.018753050591103743, + "loss": 3.534, + "mean_token_accuracy": 0.36170095205307007, + "num_tokens": 1838938775.0, + "step": 3597 + }, + { + "epoch": 0.9729583558680368, + "grad_norm": 2.65625, + "learning_rate": 0.018752249864288008, + "loss": 3.2593, + "mean_token_accuracy": 0.3948623538017273, + "num_tokens": 1839462984.0, + "step": 3598 + }, + { + "epoch": 0.9732287723093564, + "grad_norm": 2.8125, + "learning_rate": 0.018751448899610446, + "loss": 3.4912, + "mean_token_accuracy": 0.3668435215950012, + "num_tokens": 1839987229.0, + "step": 3599 + }, + { + "epoch": 0.9734991887506761, + "grad_norm": 3.484375, + "learning_rate": 0.018750647697095637, + "loss": 3.4996, + "mean_token_accuracy": 0.3544178605079651, + "num_tokens": 1840511507.0, + "step": 3600 + }, + { + "epoch": 0.9737696051919956, + "grad_norm": 4.53125, + "learning_rate": 0.018749846256768166, + "loss": 3.2974, + "mean_token_accuracy": 0.3471062183380127, + "num_tokens": 1841035741.0, + "step": 3601 + }, + { + "epoch": 0.9740400216333153, + "grad_norm": 2.796875, + "learning_rate": 0.01874904457865262, + "loss": 3.3814, + "mean_token_accuracy": 0.3608378767967224, + "num_tokens": 1841559966.0, + "step": 3602 + }, + { + "epoch": 0.9743104380746349, + "grad_norm": 3.765625, + "learning_rate": 0.018748242662773598, + "loss": 3.3451, + "mean_token_accuracy": 0.36083126068115234, + "num_tokens": 1842084162.0, + "step": 3603 + }, + { + "epoch": 0.9745808545159546, + "grad_norm": 2.890625, + "learning_rate": 0.018747440509155704, + "loss": 3.458, + "mean_token_accuracy": 0.3740738034248352, + "num_tokens": 1842583456.0, + "step": 3604 + }, + { + "epoch": 0.9748512709572742, + "grad_norm": 3.171875, + "learning_rate": 0.018746638117823552, + "loss": 3.4114, + "mean_token_accuracy": 0.38319483399391174, + "num_tokens": 1843107652.0, + "step": 3605 + }, + { + "epoch": 0.9751216873985938, + "grad_norm": 2.890625, + "learning_rate": 0.018745835488801762, + "loss": 3.407, + "mean_token_accuracy": 0.3502568006515503, + "num_tokens": 1843631895.0, + "step": 3606 + }, + { + "epoch": 0.9753921038399135, + "grad_norm": 2.828125, + "learning_rate": 0.01874503262211496, + "loss": 3.3637, + "mean_token_accuracy": 0.36211416125297546, + "num_tokens": 1844156049.0, + "step": 3607 + }, + { + "epoch": 0.9756625202812331, + "grad_norm": 2.578125, + "learning_rate": 0.01874422951778778, + "loss": 3.3666, + "mean_token_accuracy": 0.3595064580440521, + "num_tokens": 1844680307.0, + "step": 3608 + }, + { + "epoch": 0.9759329367225528, + "grad_norm": 2.875, + "learning_rate": 0.018743426175844864, + "loss": 3.2212, + "mean_token_accuracy": 0.39645564556121826, + "num_tokens": 1845150626.0, + "step": 3609 + }, + { + "epoch": 0.9762033531638724, + "grad_norm": 3.453125, + "learning_rate": 0.01874262259631086, + "loss": 3.2409, + "mean_token_accuracy": 0.37507572770118713, + "num_tokens": 1845674889.0, + "step": 3610 + }, + { + "epoch": 0.9764737696051919, + "grad_norm": 1.171875, + "learning_rate": 0.018741818779210426, + "loss": 11.1007, + "mean_token_accuracy": 5.647802936437074e-06, + "num_tokens": 1846198959.0, + "step": 3611 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 7.75, + "learning_rate": 0.018741014724568225, + "loss": 4.0197, + "mean_token_accuracy": 0.28057679533958435, + "num_tokens": 1846723227.0, + "step": 3612 + }, + { + "epoch": 0.9770146024878312, + "grad_norm": 2.75, + "learning_rate": 0.018740210432408925, + "loss": 3.62, + "mean_token_accuracy": 0.35013845562934875, + "num_tokens": 1847222379.0, + "step": 3613 + }, + { + "epoch": 0.9772850189291509, + "grad_norm": 3.125, + "learning_rate": 0.01873940590275721, + "loss": 3.4807, + "mean_token_accuracy": 0.3799547851085663, + "num_tokens": 1847746550.0, + "step": 3614 + }, + { + "epoch": 0.9775554353704705, + "grad_norm": 2.90625, + "learning_rate": 0.018738601135637762, + "loss": 3.2776, + "mean_token_accuracy": 0.3715971112251282, + "num_tokens": 1848270744.0, + "step": 3615 + }, + { + "epoch": 0.9778258518117902, + "grad_norm": 3.5, + "learning_rate": 0.01873779613107527, + "loss": 3.6065, + "mean_token_accuracy": 0.36392906308174133, + "num_tokens": 1848795020.0, + "step": 3616 + }, + { + "epoch": 0.9780962682531098, + "grad_norm": 3.234375, + "learning_rate": 0.01873699088909444, + "loss": 3.3981, + "mean_token_accuracy": 0.3599404990673065, + "num_tokens": 1849319282.0, + "step": 3617 + }, + { + "epoch": 0.9783666846944294, + "grad_norm": 2.84375, + "learning_rate": 0.018736185409719976, + "loss": 3.4273, + "mean_token_accuracy": 0.36503180861473083, + "num_tokens": 1849840651.0, + "step": 3618 + }, + { + "epoch": 0.9786371011357491, + "grad_norm": 2.578125, + "learning_rate": 0.018735379692976593, + "loss": 3.4398, + "mean_token_accuracy": 0.3635287284851074, + "num_tokens": 1850364931.0, + "step": 3619 + }, + { + "epoch": 0.9789075175770687, + "grad_norm": 2.921875, + "learning_rate": 0.01873457373888901, + "loss": 3.3939, + "mean_token_accuracy": 0.38165637850761414, + "num_tokens": 1850889203.0, + "step": 3620 + }, + { + "epoch": 0.9791779340183883, + "grad_norm": 2.96875, + "learning_rate": 0.018733767547481964, + "loss": 3.3657, + "mean_token_accuracy": 0.39855682849884033, + "num_tokens": 1851413242.0, + "step": 3621 + }, + { + "epoch": 0.9794483504597079, + "grad_norm": 3.46875, + "learning_rate": 0.01873296111878018, + "loss": 3.5837, + "mean_token_accuracy": 0.36426669359207153, + "num_tokens": 1851874216.0, + "step": 3622 + }, + { + "epoch": 0.9797187669010275, + "grad_norm": 3.125, + "learning_rate": 0.018732154452808413, + "loss": 3.2655, + "mean_token_accuracy": 0.3796209692955017, + "num_tokens": 1852398413.0, + "step": 3623 + }, + { + "epoch": 0.9799891833423472, + "grad_norm": 3.359375, + "learning_rate": 0.018731347549591406, + "loss": 3.4055, + "mean_token_accuracy": 0.35041528940200806, + "num_tokens": 1852881133.0, + "step": 3624 + }, + { + "epoch": 0.9802595997836668, + "grad_norm": 2.71875, + "learning_rate": 0.018730540409153923, + "loss": 3.3956, + "mean_token_accuracy": 0.37269270420074463, + "num_tokens": 1853405400.0, + "step": 3625 + }, + { + "epoch": 0.9805300162249865, + "grad_norm": 2.796875, + "learning_rate": 0.018729733031520722, + "loss": 3.353, + "mean_token_accuracy": 0.37676548957824707, + "num_tokens": 1853929551.0, + "step": 3626 + }, + { + "epoch": 0.9808004326663061, + "grad_norm": 2.4375, + "learning_rate": 0.018728925416716582, + "loss": 3.2015, + "mean_token_accuracy": 0.39594289660453796, + "num_tokens": 1854453815.0, + "step": 3627 + }, + { + "epoch": 0.9810708491076258, + "grad_norm": 3.421875, + "learning_rate": 0.01872811756476628, + "loss": 3.1861, + "mean_token_accuracy": 0.38761740922927856, + "num_tokens": 1854942206.0, + "step": 3628 + }, + { + "epoch": 0.9813412655489454, + "grad_norm": 3.421875, + "learning_rate": 0.018727309475694606, + "loss": 3.1488, + "mean_token_accuracy": 0.40643730759620667, + "num_tokens": 1855466360.0, + "step": 3629 + }, + { + "epoch": 0.981611681990265, + "grad_norm": 3.0, + "learning_rate": 0.018726501149526353, + "loss": 3.4923, + "mean_token_accuracy": 0.3637840449810028, + "num_tokens": 1855990611.0, + "step": 3630 + }, + { + "epoch": 0.9818820984315847, + "grad_norm": 129.0, + "learning_rate": 0.018725692586286317, + "loss": 12.4882, + "mean_token_accuracy": 0.032967302948236465, + "num_tokens": 1856466340.0, + "step": 3631 + }, + { + "epoch": 0.9821525148729042, + "grad_norm": 8.9375, + "learning_rate": 0.01872488378599932, + "loss": 3.9694, + "mean_token_accuracy": 0.31685343384742737, + "num_tokens": 1856990546.0, + "step": 3632 + }, + { + "epoch": 0.9824229313142239, + "grad_norm": 34.5, + "learning_rate": 0.018724074748690166, + "loss": 3.7525, + "mean_token_accuracy": 0.3753127455711365, + "num_tokens": 1857514806.0, + "step": 3633 + }, + { + "epoch": 0.9826933477555435, + "grad_norm": 3.90625, + "learning_rate": 0.018723265474383684, + "loss": 3.6565, + "mean_token_accuracy": 0.36383745074272156, + "num_tokens": 1858007661.0, + "step": 3634 + }, + { + "epoch": 0.9829637641968632, + "grad_norm": 3.734375, + "learning_rate": 0.018722455963104706, + "loss": 3.6868, + "mean_token_accuracy": 0.34996774792671204, + "num_tokens": 1858458062.0, + "step": 3635 + }, + { + "epoch": 0.9832341806381828, + "grad_norm": 4.125, + "learning_rate": 0.01872164621487807, + "loss": 3.451, + "mean_token_accuracy": 0.35996803641319275, + "num_tokens": 1858982329.0, + "step": 3636 + }, + { + "epoch": 0.9835045970795024, + "grad_norm": 3.296875, + "learning_rate": 0.01872083622972862, + "loss": 3.3933, + "mean_token_accuracy": 0.3793165385723114, + "num_tokens": 1859506608.0, + "step": 3637 + }, + { + "epoch": 0.9837750135208221, + "grad_norm": 3.3125, + "learning_rate": 0.018720026007681207, + "loss": 3.538, + "mean_token_accuracy": 0.3576661944389343, + "num_tokens": 1860030824.0, + "step": 3638 + }, + { + "epoch": 0.9840454299621417, + "grad_norm": 2.765625, + "learning_rate": 0.018719215548760694, + "loss": 3.4462, + "mean_token_accuracy": 0.38911986351013184, + "num_tokens": 1860473933.0, + "step": 3639 + }, + { + "epoch": 0.9843158464034614, + "grad_norm": 3.703125, + "learning_rate": 0.01871840485299195, + "loss": 3.3524, + "mean_token_accuracy": 0.35595571994781494, + "num_tokens": 1860998180.0, + "step": 3640 + }, + { + "epoch": 0.984586262844781, + "grad_norm": 2.375, + "learning_rate": 0.01871759392039984, + "loss": 3.4134, + "mean_token_accuracy": 0.3880005478858948, + "num_tokens": 1861482472.0, + "step": 3641 + }, + { + "epoch": 0.9848566792861005, + "grad_norm": 3.4375, + "learning_rate": 0.01871678275100926, + "loss": 3.344, + "mean_token_accuracy": 0.37082505226135254, + "num_tokens": 1862006679.0, + "step": 3642 + }, + { + "epoch": 0.9851270957274202, + "grad_norm": 2.109375, + "learning_rate": 0.018715971344845084, + "loss": 3.2136, + "mean_token_accuracy": 0.3888835310935974, + "num_tokens": 1862530887.0, + "step": 3643 + }, + { + "epoch": 0.9853975121687398, + "grad_norm": 3.1875, + "learning_rate": 0.018715159701932223, + "loss": 3.5227, + "mean_token_accuracy": 0.3586391508579254, + "num_tokens": 1863055084.0, + "step": 3644 + }, + { + "epoch": 0.9856679286100595, + "grad_norm": 3.625, + "learning_rate": 0.01871434782229557, + "loss": 3.2166, + "mean_token_accuracy": 0.424213171005249, + "num_tokens": 1863514410.0, + "step": 3645 + }, + { + "epoch": 0.9859383450513791, + "grad_norm": 2.515625, + "learning_rate": 0.01871353570596004, + "loss": 3.211, + "mean_token_accuracy": 0.37433695793151855, + "num_tokens": 1864005388.0, + "step": 3646 + }, + { + "epoch": 0.9862087614926988, + "grad_norm": 3.03125, + "learning_rate": 0.018712723352950548, + "loss": 3.3501, + "mean_token_accuracy": 0.38909339904785156, + "num_tokens": 1864477515.0, + "step": 3647 + }, + { + "epoch": 0.9864791779340184, + "grad_norm": 2.296875, + "learning_rate": 0.01871191076329203, + "loss": 3.1171, + "mean_token_accuracy": 0.3934634327888489, + "num_tokens": 1864994028.0, + "step": 3648 + }, + { + "epoch": 0.986749594375338, + "grad_norm": 2.734375, + "learning_rate": 0.018711097937009404, + "loss": 3.3753, + "mean_token_accuracy": 0.3650849461555481, + "num_tokens": 1865518305.0, + "step": 3649 + }, + { + "epoch": 0.9870200108166577, + "grad_norm": 3.5, + "learning_rate": 0.018710284874127618, + "loss": 3.5195, + "mean_token_accuracy": 0.3658590316772461, + "num_tokens": 1866042424.0, + "step": 3650 + }, + { + "epoch": 0.9872904272579773, + "grad_norm": 131.0, + "learning_rate": 0.018709471574671618, + "loss": 20.4542, + "mean_token_accuracy": 0.0, + "num_tokens": 1866535521.0, + "step": 3651 + }, + { + "epoch": 0.9875608436992969, + "grad_norm": 5.1875, + "learning_rate": 0.018708658038666354, + "loss": 3.5956, + "mean_token_accuracy": 0.35536640882492065, + "num_tokens": 1867037304.0, + "step": 3652 + }, + { + "epoch": 0.9878312601406165, + "grad_norm": 2.359375, + "learning_rate": 0.018707844266136794, + "loss": 3.3825, + "mean_token_accuracy": 0.3678283989429474, + "num_tokens": 1867561406.0, + "step": 3653 + }, + { + "epoch": 0.9881016765819362, + "grad_norm": 2.859375, + "learning_rate": 0.018707030257107907, + "loss": 3.3229, + "mean_token_accuracy": 0.3766152560710907, + "num_tokens": 1868085618.0, + "step": 3654 + }, + { + "epoch": 0.9883720930232558, + "grad_norm": 2.609375, + "learning_rate": 0.018706216011604663, + "loss": 3.1535, + "mean_token_accuracy": 0.3799216151237488, + "num_tokens": 1868609652.0, + "step": 3655 + }, + { + "epoch": 0.9886425094645754, + "grad_norm": 3.71875, + "learning_rate": 0.01870540152965205, + "loss": 3.4402, + "mean_token_accuracy": 0.3698440194129944, + "num_tokens": 1869077297.0, + "step": 3656 + }, + { + "epoch": 0.9889129259058951, + "grad_norm": 3.921875, + "learning_rate": 0.018704586811275063, + "loss": 3.6445, + "mean_token_accuracy": 0.3388441801071167, + "num_tokens": 1869601551.0, + "step": 3657 + }, + { + "epoch": 0.9891833423472147, + "grad_norm": 3.984375, + "learning_rate": 0.01870377185649869, + "loss": 3.559, + "mean_token_accuracy": 0.35927265882492065, + "num_tokens": 1870125768.0, + "step": 3658 + }, + { + "epoch": 0.9894537587885344, + "grad_norm": 2.90625, + "learning_rate": 0.01870295666534794, + "loss": 3.3811, + "mean_token_accuracy": 0.35663318634033203, + "num_tokens": 1870614292.0, + "step": 3659 + }, + { + "epoch": 0.989724175229854, + "grad_norm": 2.65625, + "learning_rate": 0.01870214123784783, + "loss": 3.1529, + "mean_token_accuracy": 0.38217198848724365, + "num_tokens": 1871138391.0, + "step": 3660 + }, + { + "epoch": 0.9899945916711737, + "grad_norm": 3.015625, + "learning_rate": 0.018701325574023375, + "loss": 3.3372, + "mean_token_accuracy": 0.36839091777801514, + "num_tokens": 1871662647.0, + "step": 3661 + }, + { + "epoch": 0.9902650081124933, + "grad_norm": 2.859375, + "learning_rate": 0.018700509673899607, + "loss": 3.4512, + "mean_token_accuracy": 0.370709627866745, + "num_tokens": 1872186846.0, + "step": 3662 + }, + { + "epoch": 0.9905354245538128, + "grad_norm": 2.0625, + "learning_rate": 0.01869969353750155, + "loss": 3.3678, + "mean_token_accuracy": 0.3906267285346985, + "num_tokens": 1872664401.0, + "step": 3663 + }, + { + "epoch": 0.9908058409951325, + "grad_norm": 2.421875, + "learning_rate": 0.018698877164854254, + "loss": 3.2191, + "mean_token_accuracy": 0.42465800046920776, + "num_tokens": 1873123583.0, + "step": 3664 + }, + { + "epoch": 0.9910762574364521, + "grad_norm": 2.578125, + "learning_rate": 0.01869806055598277, + "loss": 3.3709, + "mean_token_accuracy": 0.3898748755455017, + "num_tokens": 1873617469.0, + "step": 3665 + }, + { + "epoch": 0.9913466738777718, + "grad_norm": 2.6875, + "learning_rate": 0.018697243710912144, + "loss": 3.515, + "mean_token_accuracy": 0.36163130402565, + "num_tokens": 1874141716.0, + "step": 3666 + }, + { + "epoch": 0.9916170903190914, + "grad_norm": 2.25, + "learning_rate": 0.01869642662966745, + "loss": 3.1597, + "mean_token_accuracy": 0.3951108455657959, + "num_tokens": 1874665919.0, + "step": 3667 + }, + { + "epoch": 0.991887506760411, + "grad_norm": 2.5625, + "learning_rate": 0.01869560931227375, + "loss": 3.2232, + "mean_token_accuracy": 0.39559972286224365, + "num_tokens": 1875165189.0, + "step": 3668 + }, + { + "epoch": 0.9921579232017307, + "grad_norm": 2.703125, + "learning_rate": 0.018694791758756126, + "loss": 3.0568, + "mean_token_accuracy": 0.40826135873794556, + "num_tokens": 1875676549.0, + "step": 3669 + }, + { + "epoch": 0.9924283396430503, + "grad_norm": 3.515625, + "learning_rate": 0.01869397396913966, + "loss": 3.3411, + "mean_token_accuracy": 0.37726640701293945, + "num_tokens": 1876160038.0, + "step": 3670 + }, + { + "epoch": 0.99269875608437, + "grad_norm": 1.6328125, + "learning_rate": 0.018693155943449447, + "loss": 11.8217, + "mean_token_accuracy": 5.7145425671478733e-05, + "num_tokens": 1876684207.0, + "step": 3671 + }, + { + "epoch": 0.9929691725256896, + "grad_norm": 7.96875, + "learning_rate": 0.018692337681710587, + "loss": 4.1402, + "mean_token_accuracy": 0.29537028074264526, + "num_tokens": 1877208383.0, + "step": 3672 + }, + { + "epoch": 0.9932395889670091, + "grad_norm": 2.125, + "learning_rate": 0.018691519183948183, + "loss": 3.5007, + "mean_token_accuracy": 0.36766326427459717, + "num_tokens": 1877732658.0, + "step": 3673 + }, + { + "epoch": 0.9935100054083288, + "grad_norm": 3.578125, + "learning_rate": 0.018690700450187354, + "loss": 3.2732, + "mean_token_accuracy": 0.35983723402023315, + "num_tokens": 1878256918.0, + "step": 3674 + }, + { + "epoch": 0.9937804218496484, + "grad_norm": 2.765625, + "learning_rate": 0.018689881480453214, + "loss": 3.4616, + "mean_token_accuracy": 0.3621615171432495, + "num_tokens": 1878780905.0, + "step": 3675 + }, + { + "epoch": 0.9940508382909681, + "grad_norm": 3.09375, + "learning_rate": 0.0186890622747709, + "loss": 3.5167, + "mean_token_accuracy": 0.3497070372104645, + "num_tokens": 1879305086.0, + "step": 3676 + }, + { + "epoch": 0.9943212547322877, + "grad_norm": 2.921875, + "learning_rate": 0.018688242833165544, + "loss": 3.2911, + "mean_token_accuracy": 0.3872901201248169, + "num_tokens": 1879791733.0, + "step": 3677 + }, + { + "epoch": 0.9945916711736074, + "grad_norm": 4.6875, + "learning_rate": 0.018687423155662283, + "loss": 3.4584, + "mean_token_accuracy": 0.35945045948028564, + "num_tokens": 1880265014.0, + "step": 3678 + }, + { + "epoch": 0.994862087614927, + "grad_norm": 2.671875, + "learning_rate": 0.018686603242286277, + "loss": 3.6102, + "mean_token_accuracy": 0.36205413937568665, + "num_tokens": 1880778919.0, + "step": 3679 + }, + { + "epoch": 0.9951325040562466, + "grad_norm": 4.53125, + "learning_rate": 0.018685783093062677, + "loss": 3.6351, + "mean_token_accuracy": 0.3430488705635071, + "num_tokens": 1881303107.0, + "step": 3680 + }, + { + "epoch": 0.9954029204975663, + "grad_norm": 2.59375, + "learning_rate": 0.018684962708016648, + "loss": 3.3342, + "mean_token_accuracy": 0.37052518129348755, + "num_tokens": 1881827282.0, + "step": 3681 + }, + { + "epoch": 0.9956733369388859, + "grad_norm": 3.359375, + "learning_rate": 0.018684142087173363, + "loss": 3.2472, + "mean_token_accuracy": 0.3562639355659485, + "num_tokens": 1882351499.0, + "step": 3682 + }, + { + "epoch": 0.9959437533802055, + "grad_norm": 2.375, + "learning_rate": 0.018683321230558003, + "loss": 3.2899, + "mean_token_accuracy": 0.39067041873931885, + "num_tokens": 1882875754.0, + "step": 3683 + }, + { + "epoch": 0.9962141698215251, + "grad_norm": 3.890625, + "learning_rate": 0.01868250013819575, + "loss": 3.4183, + "mean_token_accuracy": 0.35467833280563354, + "num_tokens": 1883399966.0, + "step": 3684 + }, + { + "epoch": 0.9964845862628448, + "grad_norm": 2.671875, + "learning_rate": 0.018681678810111804, + "loss": 3.121, + "mean_token_accuracy": 0.37358593940734863, + "num_tokens": 1883924079.0, + "step": 3685 + }, + { + "epoch": 0.9967550027041644, + "grad_norm": 3.4375, + "learning_rate": 0.01868085724633136, + "loss": 3.469, + "mean_token_accuracy": 0.3621077239513397, + "num_tokens": 1884448238.0, + "step": 3686 + }, + { + "epoch": 0.997025419145484, + "grad_norm": 2.796875, + "learning_rate": 0.018680035446879628, + "loss": 3.4039, + "mean_token_accuracy": 0.37653297185897827, + "num_tokens": 1884972508.0, + "step": 3687 + }, + { + "epoch": 0.9972958355868037, + "grad_norm": 3.03125, + "learning_rate": 0.01867921341178182, + "loss": 3.4539, + "mean_token_accuracy": 0.3677895665168762, + "num_tokens": 1885496775.0, + "step": 3688 + }, + { + "epoch": 0.9975662520281233, + "grad_norm": 2.96875, + "learning_rate": 0.018678391141063167, + "loss": 3.3418, + "mean_token_accuracy": 0.3780708909034729, + "num_tokens": 1886003934.0, + "step": 3689 + }, + { + "epoch": 0.997836668469443, + "grad_norm": 3.484375, + "learning_rate": 0.01867756863474889, + "loss": 3.2515, + "mean_token_accuracy": 0.3659766912460327, + "num_tokens": 1886528197.0, + "step": 3690 + }, + { + "epoch": 0.9981070849107626, + "grad_norm": 27.0, + "learning_rate": 0.01867674589286423, + "loss": 15.3974, + "mean_token_accuracy": 0.010531876236200333, + "num_tokens": 1887052448.0, + "step": 3691 + }, + { + "epoch": 0.9983775013520823, + "grad_norm": 7.53125, + "learning_rate": 0.01867592291543443, + "loss": 3.5044, + "mean_token_accuracy": 0.3354979157447815, + "num_tokens": 1887576709.0, + "step": 3692 + }, + { + "epoch": 0.9986479177934018, + "grad_norm": 3.5625, + "learning_rate": 0.018675099702484738, + "loss": 3.6467, + "mean_token_accuracy": 0.34611836075782776, + "num_tokens": 1888100991.0, + "step": 3693 + }, + { + "epoch": 0.9989183342347214, + "grad_norm": 3.40625, + "learning_rate": 0.01867427625404042, + "loss": 3.5097, + "mean_token_accuracy": 0.351121723651886, + "num_tokens": 1888625266.0, + "step": 3694 + }, + { + "epoch": 0.9991887506760411, + "grad_norm": 3.203125, + "learning_rate": 0.018673452570126738, + "loss": 3.5512, + "mean_token_accuracy": 0.3497968316078186, + "num_tokens": 1889149502.0, + "step": 3695 + }, + { + "epoch": 0.9994591671173607, + "grad_norm": 3.109375, + "learning_rate": 0.018672628650768963, + "loss": 3.5278, + "mean_token_accuracy": 0.38647687435150146, + "num_tokens": 1889538206.0, + "step": 3696 + }, + { + "epoch": 0.9997295835586804, + "grad_norm": 3.546875, + "learning_rate": 0.01867180449599238, + "loss": 3.3324, + "mean_token_accuracy": 0.3770413398742676, + "num_tokens": 1890062338.0, + "step": 3697 + }, + { + "epoch": 1.0, + "grad_norm": 3.46875, + "learning_rate": 0.01867098010582227, + "loss": 3.4794, + "mean_token_accuracy": 0.3738538324832916, + "num_tokens": 1890324479.0, + "step": 3698 + }, + { + "epoch": 1.0002704164413196, + "grad_norm": 3.140625, + "learning_rate": 0.018670155480283935, + "loss": 3.394, + "mean_token_accuracy": 0.3464309871196747, + "num_tokens": 1890848514.0, + "step": 3699 + }, + { + "epoch": 1.0005408328826393, + "grad_norm": 2.390625, + "learning_rate": 0.01866933061940267, + "loss": 3.5316, + "mean_token_accuracy": 0.36596792936325073, + "num_tokens": 1891372686.0, + "step": 3700 + }, + { + "epoch": 1.000811249323959, + "grad_norm": 3.3125, + "learning_rate": 0.018668505523203788, + "loss": 3.2945, + "mean_token_accuracy": 0.38769248127937317, + "num_tokens": 1891863149.0, + "step": 3701 + }, + { + "epoch": 1.0010816657652786, + "grad_norm": 3.609375, + "learning_rate": 0.018667680191712607, + "loss": 3.4234, + "mean_token_accuracy": 0.37723806500434875, + "num_tokens": 1892352735.0, + "step": 3702 + }, + { + "epoch": 1.0013520822065982, + "grad_norm": 3.0, + "learning_rate": 0.01866685462495445, + "loss": 3.3437, + "mean_token_accuracy": 0.35179197788238525, + "num_tokens": 1892876872.0, + "step": 3703 + }, + { + "epoch": 1.0016224986479179, + "grad_norm": 5.28125, + "learning_rate": 0.018666028822954643, + "loss": 3.2592, + "mean_token_accuracy": 0.3912816643714905, + "num_tokens": 1893401106.0, + "step": 3704 + }, + { + "epoch": 1.0018929150892375, + "grad_norm": 2.203125, + "learning_rate": 0.01866520278573853, + "loss": 3.412, + "mean_token_accuracy": 0.37883323431015015, + "num_tokens": 1893925293.0, + "step": 3705 + }, + { + "epoch": 1.0021633315305571, + "grad_norm": 3.0625, + "learning_rate": 0.01866437651333145, + "loss": 3.4053, + "mean_token_accuracy": 0.37246978282928467, + "num_tokens": 1894441583.0, + "step": 3706 + }, + { + "epoch": 1.0024337479718768, + "grad_norm": 3.109375, + "learning_rate": 0.018663550005758762, + "loss": 3.2831, + "mean_token_accuracy": 0.38675636053085327, + "num_tokens": 1894965206.0, + "step": 3707 + }, + { + "epoch": 1.0027041644131964, + "grad_norm": 3.171875, + "learning_rate": 0.018662723263045824, + "loss": 3.482, + "mean_token_accuracy": 0.35601598024368286, + "num_tokens": 1895489403.0, + "step": 3708 + }, + { + "epoch": 1.0029745808545159, + "grad_norm": 3.4375, + "learning_rate": 0.018661896285217996, + "loss": 3.414, + "mean_token_accuracy": 0.37078964710235596, + "num_tokens": 1896013674.0, + "step": 3709 + }, + { + "epoch": 1.0032449972958355, + "grad_norm": 2.859375, + "learning_rate": 0.018661069072300664, + "loss": 3.2662, + "mean_token_accuracy": 0.3880842328071594, + "num_tokens": 1896511556.0, + "step": 3710 + }, + { + "epoch": 1.0035154137371551, + "grad_norm": 3.03125, + "learning_rate": 0.018660241624319198, + "loss": 10.8517, + "mean_token_accuracy": 0.0, + "num_tokens": 1897019577.0, + "step": 3711 + }, + { + "epoch": 1.0037858301784748, + "grad_norm": 7.875, + "learning_rate": 0.018659413941298997, + "loss": 4.0372, + "mean_token_accuracy": 0.28060221672058105, + "num_tokens": 1897492233.0, + "step": 3712 + }, + { + "epoch": 1.0040562466197944, + "grad_norm": 2.859375, + "learning_rate": 0.01865858602326545, + "loss": 3.6625, + "mean_token_accuracy": 0.33254605531692505, + "num_tokens": 1898016404.0, + "step": 3713 + }, + { + "epoch": 1.004326663061114, + "grad_norm": 2.5625, + "learning_rate": 0.01865775787024396, + "loss": 3.3245, + "mean_token_accuracy": 0.3641456663608551, + "num_tokens": 1898535422.0, + "step": 3714 + }, + { + "epoch": 1.0045970795024337, + "grad_norm": 2.421875, + "learning_rate": 0.018656929482259936, + "loss": 3.3053, + "mean_token_accuracy": 0.3721989393234253, + "num_tokens": 1899059560.0, + "step": 3715 + }, + { + "epoch": 1.0048674959437534, + "grad_norm": 3.0625, + "learning_rate": 0.0186561008593388, + "loss": 3.5431, + "mean_token_accuracy": 0.3511878252029419, + "num_tokens": 1899583827.0, + "step": 3716 + }, + { + "epoch": 1.005137912385073, + "grad_norm": 3.8125, + "learning_rate": 0.01865527200150598, + "loss": 3.494, + "mean_token_accuracy": 0.36205393075942993, + "num_tokens": 1900023029.0, + "step": 3717 + }, + { + "epoch": 1.0054083288263926, + "grad_norm": 3.34375, + "learning_rate": 0.018654442908786897, + "loss": 3.3593, + "mean_token_accuracy": 0.37394022941589355, + "num_tokens": 1900547313.0, + "step": 3718 + }, + { + "epoch": 1.0056787452677123, + "grad_norm": 2.765625, + "learning_rate": 0.018653613581206995, + "loss": 3.3101, + "mean_token_accuracy": 0.3848947286605835, + "num_tokens": 1901019097.0, + "step": 3719 + }, + { + "epoch": 1.005949161709032, + "grad_norm": 2.984375, + "learning_rate": 0.01865278401879172, + "loss": 3.4478, + "mean_token_accuracy": 0.3819316625595093, + "num_tokens": 1901492270.0, + "step": 3720 + }, + { + "epoch": 1.0062195781503516, + "grad_norm": 3.578125, + "learning_rate": 0.018651954221566527, + "loss": 3.5764, + "mean_token_accuracy": 0.3804694414138794, + "num_tokens": 1902016535.0, + "step": 3721 + }, + { + "epoch": 1.0064899945916712, + "grad_norm": 4.0625, + "learning_rate": 0.01865112418955688, + "loss": 3.3629, + "mean_token_accuracy": 0.3585045039653778, + "num_tokens": 1902540817.0, + "step": 3722 + }, + { + "epoch": 1.0067604110329909, + "grad_norm": 2.5625, + "learning_rate": 0.018650293922788237, + "loss": 3.3312, + "mean_token_accuracy": 0.36089956760406494, + "num_tokens": 1903065076.0, + "step": 3723 + }, + { + "epoch": 1.0070308274743105, + "grad_norm": 3.015625, + "learning_rate": 0.018649463421286077, + "loss": 3.3959, + "mean_token_accuracy": 0.3655688762664795, + "num_tokens": 1903579433.0, + "step": 3724 + }, + { + "epoch": 1.0073012439156301, + "grad_norm": 2.953125, + "learning_rate": 0.018648632685075887, + "loss": 3.5465, + "mean_token_accuracy": 0.3748162090778351, + "num_tokens": 1904103598.0, + "step": 3725 + }, + { + "epoch": 1.0075716603569498, + "grad_norm": 3.234375, + "learning_rate": 0.01864780171418315, + "loss": 3.2322, + "mean_token_accuracy": 0.37929072976112366, + "num_tokens": 1904627743.0, + "step": 3726 + }, + { + "epoch": 1.0078420767982694, + "grad_norm": 3.0, + "learning_rate": 0.01864697050863337, + "loss": 3.2822, + "mean_token_accuracy": 0.35403579473495483, + "num_tokens": 1905151985.0, + "step": 3727 + }, + { + "epoch": 1.008112493239589, + "grad_norm": 3.25, + "learning_rate": 0.018646139068452045, + "loss": 3.4196, + "mean_token_accuracy": 0.3840349316596985, + "num_tokens": 1905630300.0, + "step": 3728 + }, + { + "epoch": 1.0083829096809085, + "grad_norm": 10.0625, + "learning_rate": 0.018645307393664688, + "loss": 3.5805, + "mean_token_accuracy": 0.3651992678642273, + "num_tokens": 1906154463.0, + "step": 3729 + }, + { + "epoch": 1.0086533261222281, + "grad_norm": 2.015625, + "learning_rate": 0.018644475484296815, + "loss": 3.3005, + "mean_token_accuracy": 0.39205121994018555, + "num_tokens": 1906656754.0, + "step": 3730 + }, + { + "epoch": 1.0089237425635478, + "grad_norm": 74.0, + "learning_rate": 0.018643643340373953, + "loss": 16.3625, + "mean_token_accuracy": 7.908187399152666e-06, + "num_tokens": 1907181009.0, + "step": 3731 + }, + { + "epoch": 1.0091941590048674, + "grad_norm": 8.3125, + "learning_rate": 0.018642810961921635, + "loss": 4.1306, + "mean_token_accuracy": 0.3271692991256714, + "num_tokens": 1907642860.0, + "step": 3732 + }, + { + "epoch": 1.009464575446187, + "grad_norm": 3.5, + "learning_rate": 0.018641978348965402, + "loss": 3.5436, + "mean_token_accuracy": 0.34063535928726196, + "num_tokens": 1908167042.0, + "step": 3733 + }, + { + "epoch": 1.0097349918875067, + "grad_norm": 3.90625, + "learning_rate": 0.018641145501530797, + "loss": 3.4673, + "mean_token_accuracy": 0.34974631667137146, + "num_tokens": 1908691299.0, + "step": 3734 + }, + { + "epoch": 1.0100054083288263, + "grad_norm": 2.34375, + "learning_rate": 0.01864031241964338, + "loss": 3.1717, + "mean_token_accuracy": 0.393732488155365, + "num_tokens": 1909215481.0, + "step": 3735 + }, + { + "epoch": 1.010275824770146, + "grad_norm": 2.71875, + "learning_rate": 0.01863947910332871, + "loss": 3.4525, + "mean_token_accuracy": 0.39286959171295166, + "num_tokens": 1909704435.0, + "step": 3736 + }, + { + "epoch": 1.0105462412114656, + "grad_norm": 2.78125, + "learning_rate": 0.01863864555261235, + "loss": 3.6096, + "mean_token_accuracy": 0.37892115116119385, + "num_tokens": 1910171103.0, + "step": 3737 + }, + { + "epoch": 1.0108166576527853, + "grad_norm": 3.71875, + "learning_rate": 0.01863781176751989, + "loss": 3.4915, + "mean_token_accuracy": 0.3387328088283539, + "num_tokens": 1910695370.0, + "step": 3738 + }, + { + "epoch": 1.011087074094105, + "grad_norm": 3.515625, + "learning_rate": 0.018636977748076895, + "loss": 3.578, + "mean_token_accuracy": 0.34043192863464355, + "num_tokens": 1911219601.0, + "step": 3739 + }, + { + "epoch": 1.0113574905354246, + "grad_norm": 2.46875, + "learning_rate": 0.018636143494308963, + "loss": 3.3382, + "mean_token_accuracy": 0.37974172830581665, + "num_tokens": 1911743583.0, + "step": 3740 + }, + { + "epoch": 1.0116279069767442, + "grad_norm": 2.3125, + "learning_rate": 0.018635309006241697, + "loss": 3.5951, + "mean_token_accuracy": 0.3443930745124817, + "num_tokens": 1912267676.0, + "step": 3741 + }, + { + "epoch": 1.0118983234180638, + "grad_norm": 2.828125, + "learning_rate": 0.018634474283900696, + "loss": 3.2599, + "mean_token_accuracy": 0.3769374489784241, + "num_tokens": 1912762993.0, + "step": 3742 + }, + { + "epoch": 1.0121687398593835, + "grad_norm": 3.1875, + "learning_rate": 0.01863363932731157, + "loss": 3.3011, + "mean_token_accuracy": 0.3743517994880676, + "num_tokens": 1913287131.0, + "step": 3743 + }, + { + "epoch": 1.0124391563007031, + "grad_norm": 3.515625, + "learning_rate": 0.018632804136499943, + "loss": 3.3657, + "mean_token_accuracy": 0.39142048358917236, + "num_tokens": 1913764543.0, + "step": 3744 + }, + { + "epoch": 1.0127095727420228, + "grad_norm": 3.5, + "learning_rate": 0.018631968711491437, + "loss": 3.5152, + "mean_token_accuracy": 0.38488417863845825, + "num_tokens": 1914231648.0, + "step": 3745 + }, + { + "epoch": 1.0129799891833424, + "grad_norm": 3.25, + "learning_rate": 0.018631133052311687, + "loss": 3.2858, + "mean_token_accuracy": 0.37396693229675293, + "num_tokens": 1914755926.0, + "step": 3746 + }, + { + "epoch": 1.013250405624662, + "grad_norm": 2.96875, + "learning_rate": 0.018630297158986335, + "loss": 3.2983, + "mean_token_accuracy": 0.38369983434677124, + "num_tokens": 1915280120.0, + "step": 3747 + }, + { + "epoch": 1.0135208220659817, + "grad_norm": 2.875, + "learning_rate": 0.018629461031541023, + "loss": 3.3788, + "mean_token_accuracy": 0.369184672832489, + "num_tokens": 1915804281.0, + "step": 3748 + }, + { + "epoch": 1.0137912385073014, + "grad_norm": 2.9375, + "learning_rate": 0.018628624670001415, + "loss": 3.3244, + "mean_token_accuracy": 0.40329423546791077, + "num_tokens": 1916280472.0, + "step": 3749 + }, + { + "epoch": 1.0140616549486208, + "grad_norm": 2.765625, + "learning_rate": 0.018627788074393165, + "loss": 3.2113, + "mean_token_accuracy": 0.3883340060710907, + "num_tokens": 1916804710.0, + "step": 3750 + }, + { + "epoch": 1.0143320713899404, + "grad_norm": 7.5625, + "learning_rate": 0.018626951244741945, + "loss": 11.9911, + "mean_token_accuracy": 0.0, + "num_tokens": 1917328976.0, + "step": 3751 + }, + { + "epoch": 1.01460248783126, + "grad_norm": 6.09375, + "learning_rate": 0.01862611418107343, + "loss": 3.7822, + "mean_token_accuracy": 0.3268886208534241, + "num_tokens": 1917770998.0, + "step": 3752 + }, + { + "epoch": 1.0148729042725797, + "grad_norm": 2.046875, + "learning_rate": 0.018625276883413307, + "loss": 3.3713, + "mean_token_accuracy": 0.3701677918434143, + "num_tokens": 1918295158.0, + "step": 3753 + }, + { + "epoch": 1.0151433207138993, + "grad_norm": 3.34375, + "learning_rate": 0.018624439351787262, + "loss": 3.4047, + "mean_token_accuracy": 0.3673887252807617, + "num_tokens": 1918819275.0, + "step": 3754 + }, + { + "epoch": 1.015413737155219, + "grad_norm": 2.984375, + "learning_rate": 0.018623601586221, + "loss": 3.5335, + "mean_token_accuracy": 0.376223087310791, + "num_tokens": 1919343449.0, + "step": 3755 + }, + { + "epoch": 1.0156841535965386, + "grad_norm": 4.0, + "learning_rate": 0.01862276358674022, + "loss": 3.4401, + "mean_token_accuracy": 0.35223281383514404, + "num_tokens": 1919867606.0, + "step": 3756 + }, + { + "epoch": 1.0159545700378583, + "grad_norm": 4.96875, + "learning_rate": 0.018621925353370636, + "loss": 3.1652, + "mean_token_accuracy": 0.4422222077846527, + "num_tokens": 1920391781.0, + "step": 3757 + }, + { + "epoch": 1.016224986479178, + "grad_norm": 2.75, + "learning_rate": 0.018621086886137967, + "loss": 3.4059, + "mean_token_accuracy": 0.3657386302947998, + "num_tokens": 1920915961.0, + "step": 3758 + }, + { + "epoch": 1.0164954029204976, + "grad_norm": 4.21875, + "learning_rate": 0.01862024818506794, + "loss": 3.5626, + "mean_token_accuracy": 0.3653976619243622, + "num_tokens": 1921440231.0, + "step": 3759 + }, + { + "epoch": 1.0167658193618172, + "grad_norm": 2.484375, + "learning_rate": 0.01861940925018629, + "loss": 3.3964, + "mean_token_accuracy": 0.3621499538421631, + "num_tokens": 1921964379.0, + "step": 3760 + }, + { + "epoch": 1.0170362358031368, + "grad_norm": 3.046875, + "learning_rate": 0.018618570081518755, + "loss": 3.3258, + "mean_token_accuracy": 0.3819407820701599, + "num_tokens": 1922488592.0, + "step": 3761 + }, + { + "epoch": 1.0173066522444565, + "grad_norm": 3.125, + "learning_rate": 0.018617730679091084, + "loss": 3.4125, + "mean_token_accuracy": 0.3702578842639923, + "num_tokens": 1923006901.0, + "step": 3762 + }, + { + "epoch": 1.0175770686857761, + "grad_norm": 2.703125, + "learning_rate": 0.018616891042929035, + "loss": 3.0893, + "mean_token_accuracy": 0.3906392753124237, + "num_tokens": 1923531158.0, + "step": 3763 + }, + { + "epoch": 1.0178474851270958, + "grad_norm": 3.140625, + "learning_rate": 0.01861605117305837, + "loss": 3.265, + "mean_token_accuracy": 0.3855467438697815, + "num_tokens": 1924055335.0, + "step": 3764 + }, + { + "epoch": 1.0181179015684154, + "grad_norm": 3.3125, + "learning_rate": 0.01861521106950486, + "loss": 3.4317, + "mean_token_accuracy": 0.363084077835083, + "num_tokens": 1924579459.0, + "step": 3765 + }, + { + "epoch": 1.018388318009735, + "grad_norm": 3.046875, + "learning_rate": 0.018614370732294275, + "loss": 3.3855, + "mean_token_accuracy": 0.3733553886413574, + "num_tokens": 1925103629.0, + "step": 3766 + }, + { + "epoch": 1.0186587344510547, + "grad_norm": 4.25, + "learning_rate": 0.018613530161452407, + "loss": 3.4675, + "mean_token_accuracy": 0.3514488935470581, + "num_tokens": 1925627873.0, + "step": 3767 + }, + { + "epoch": 1.0189291508923743, + "grad_norm": 3.28125, + "learning_rate": 0.018612689357005045, + "loss": 3.3394, + "mean_token_accuracy": 0.40062814950942993, + "num_tokens": 1926130018.0, + "step": 3768 + }, + { + "epoch": 1.019199567333694, + "grad_norm": 3.25, + "learning_rate": 0.018611848318977983, + "loss": 3.5634, + "mean_token_accuracy": 0.3684934079647064, + "num_tokens": 1926609931.0, + "step": 3769 + }, + { + "epoch": 1.0194699837750134, + "grad_norm": 2.890625, + "learning_rate": 0.01861100704739703, + "loss": 3.6346, + "mean_token_accuracy": 0.3472124934196472, + "num_tokens": 1927110597.0, + "step": 3770 + }, + { + "epoch": 1.019740400216333, + "grad_norm": 33.5, + "learning_rate": 0.018610165542287997, + "loss": 9.5413, + "mean_token_accuracy": 0.02980058267712593, + "num_tokens": 1927634879.0, + "step": 3771 + }, + { + "epoch": 1.0200108166576527, + "grad_norm": 13.6875, + "learning_rate": 0.018609323803676702, + "loss": 3.809, + "mean_token_accuracy": 0.31697267293930054, + "num_tokens": 1928159126.0, + "step": 3772 + }, + { + "epoch": 1.0202812330989723, + "grad_norm": 3.03125, + "learning_rate": 0.01860848183158898, + "loss": 3.6155, + "mean_token_accuracy": 0.3007943034172058, + "num_tokens": 1928683279.0, + "step": 3773 + }, + { + "epoch": 1.020551649540292, + "grad_norm": 2.953125, + "learning_rate": 0.01860763962605066, + "loss": 3.4647, + "mean_token_accuracy": 0.3546658754348755, + "num_tokens": 1929207483.0, + "step": 3774 + }, + { + "epoch": 1.0208220659816116, + "grad_norm": 2.953125, + "learning_rate": 0.018606797187087586, + "loss": 3.515, + "mean_token_accuracy": 0.3551124632358551, + "num_tokens": 1929731731.0, + "step": 3775 + }, + { + "epoch": 1.0210924824229313, + "grad_norm": 3.0, + "learning_rate": 0.018605954514725603, + "loss": 3.4926, + "mean_token_accuracy": 0.36336296796798706, + "num_tokens": 1930256016.0, + "step": 3776 + }, + { + "epoch": 1.021362898864251, + "grad_norm": 2.640625, + "learning_rate": 0.018605111608990567, + "loss": 3.4895, + "mean_token_accuracy": 0.3893486559391022, + "num_tokens": 1930721057.0, + "step": 3777 + }, + { + "epoch": 1.0216333153055706, + "grad_norm": 2.375, + "learning_rate": 0.01860426846990834, + "loss": 3.1986, + "mean_token_accuracy": 0.39324304461479187, + "num_tokens": 1931182030.0, + "step": 3778 + }, + { + "epoch": 1.0219037317468902, + "grad_norm": 2.453125, + "learning_rate": 0.018603425097504792, + "loss": 3.4639, + "mean_token_accuracy": 0.364321231842041, + "num_tokens": 1931706261.0, + "step": 3779 + }, + { + "epoch": 1.0221741481882098, + "grad_norm": 3.28125, + "learning_rate": 0.018602581491805804, + "loss": 3.342, + "mean_token_accuracy": 0.3708338737487793, + "num_tokens": 1932180011.0, + "step": 3780 + }, + { + "epoch": 1.0224445646295295, + "grad_norm": 3.40625, + "learning_rate": 0.01860173765283726, + "loss": 3.5528, + "mean_token_accuracy": 0.36718302965164185, + "num_tokens": 1932692739.0, + "step": 3781 + }, + { + "epoch": 1.0227149810708491, + "grad_norm": 3.125, + "learning_rate": 0.01860089358062505, + "loss": 3.3213, + "mean_token_accuracy": 0.3923090696334839, + "num_tokens": 1933151668.0, + "step": 3782 + }, + { + "epoch": 1.0229853975121688, + "grad_norm": 2.90625, + "learning_rate": 0.01860004927519507, + "loss": 3.3141, + "mean_token_accuracy": 0.39444372057914734, + "num_tokens": 1933675874.0, + "step": 3783 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 3.765625, + "learning_rate": 0.01859920473657323, + "loss": 3.6088, + "mean_token_accuracy": 0.33185458183288574, + "num_tokens": 1934200118.0, + "step": 3784 + }, + { + "epoch": 1.023526230394808, + "grad_norm": 2.703125, + "learning_rate": 0.018598359964785437, + "loss": 3.4382, + "mean_token_accuracy": 0.36905527114868164, + "num_tokens": 1934724293.0, + "step": 3785 + }, + { + "epoch": 1.0237966468361277, + "grad_norm": 3.40625, + "learning_rate": 0.018597514959857616, + "loss": 3.4836, + "mean_token_accuracy": 0.36716049909591675, + "num_tokens": 1935213230.0, + "step": 3786 + }, + { + "epoch": 1.0240670632774473, + "grad_norm": 3.125, + "learning_rate": 0.018596669721815697, + "loss": 3.5274, + "mean_token_accuracy": 0.37536054849624634, + "num_tokens": 1935737489.0, + "step": 3787 + }, + { + "epoch": 1.024337479718767, + "grad_norm": 2.59375, + "learning_rate": 0.018595824250685605, + "loss": 3.0512, + "mean_token_accuracy": 0.37884750962257385, + "num_tokens": 1936261620.0, + "step": 3788 + }, + { + "epoch": 1.0246078961600866, + "grad_norm": 2.609375, + "learning_rate": 0.01859497854649329, + "loss": 3.3124, + "mean_token_accuracy": 0.3786991238594055, + "num_tokens": 1936667211.0, + "step": 3789 + }, + { + "epoch": 1.0248783126014063, + "grad_norm": 3.0, + "learning_rate": 0.0185941326092647, + "loss": 3.3474, + "mean_token_accuracy": 0.35577455163002014, + "num_tokens": 1937191426.0, + "step": 3790 + }, + { + "epoch": 1.0251487290427257, + "grad_norm": 2.109375, + "learning_rate": 0.018593286439025785, + "loss": 11.411, + "mean_token_accuracy": 5.393335595726967e-05, + "num_tokens": 1937715646.0, + "step": 3791 + }, + { + "epoch": 1.0254191454840453, + "grad_norm": 10.375, + "learning_rate": 0.018592440035802512, + "loss": 4.2256, + "mean_token_accuracy": 0.30584782361984253, + "num_tokens": 1938148518.0, + "step": 3792 + }, + { + "epoch": 1.025689561925365, + "grad_norm": 3.375, + "learning_rate": 0.018591593399620854, + "loss": 3.6417, + "mean_token_accuracy": 0.3363719582557678, + "num_tokens": 1938672699.0, + "step": 3793 + }, + { + "epoch": 1.0259599783666846, + "grad_norm": 3.78125, + "learning_rate": 0.018590746530506783, + "loss": 3.6664, + "mean_token_accuracy": 0.354971706867218, + "num_tokens": 1939196744.0, + "step": 3794 + }, + { + "epoch": 1.0262303948080043, + "grad_norm": 3.5625, + "learning_rate": 0.018589899428486288, + "loss": 3.8189, + "mean_token_accuracy": 0.2981141209602356, + "num_tokens": 1939721024.0, + "step": 3795 + }, + { + "epoch": 1.026500811249324, + "grad_norm": 2.703125, + "learning_rate": 0.018589052093585353, + "loss": 3.311, + "mean_token_accuracy": 0.3790777623653412, + "num_tokens": 1940245221.0, + "step": 3796 + }, + { + "epoch": 1.0267712276906436, + "grad_norm": 2.703125, + "learning_rate": 0.018588204525829984, + "loss": 3.5777, + "mean_token_accuracy": 0.3365553021430969, + "num_tokens": 1940769403.0, + "step": 3797 + }, + { + "epoch": 1.0270416441319632, + "grad_norm": 2.59375, + "learning_rate": 0.01858735672524619, + "loss": 3.499, + "mean_token_accuracy": 0.3613322675228119, + "num_tokens": 1941293514.0, + "step": 3798 + }, + { + "epoch": 1.0273120605732828, + "grad_norm": 3.96875, + "learning_rate": 0.01858650869185997, + "loss": 3.5801, + "mean_token_accuracy": 0.36462587118148804, + "num_tokens": 1941817769.0, + "step": 3799 + }, + { + "epoch": 1.0275824770146025, + "grad_norm": 2.421875, + "learning_rate": 0.018585660425697356, + "loss": 3.4103, + "mean_token_accuracy": 0.3774913549423218, + "num_tokens": 1942342006.0, + "step": 3800 + }, + { + "epoch": 1.0278528934559221, + "grad_norm": 3.140625, + "learning_rate": 0.018584811926784375, + "loss": 3.2783, + "mean_token_accuracy": 0.38138070702552795, + "num_tokens": 1942834853.0, + "step": 3801 + }, + { + "epoch": 1.0281233098972418, + "grad_norm": 2.859375, + "learning_rate": 0.018583963195147057, + "loss": 3.0424, + "mean_token_accuracy": 0.38481923937797546, + "num_tokens": 1943359091.0, + "step": 3802 + }, + { + "epoch": 1.0283937263385614, + "grad_norm": 3.0625, + "learning_rate": 0.018583114230811444, + "loss": 3.3681, + "mean_token_accuracy": 0.3549402356147766, + "num_tokens": 1943883215.0, + "step": 3803 + }, + { + "epoch": 1.028664142779881, + "grad_norm": 3.171875, + "learning_rate": 0.018582265033803586, + "loss": 3.4893, + "mean_token_accuracy": 0.3548034131526947, + "num_tokens": 1944407408.0, + "step": 3804 + }, + { + "epoch": 1.0289345592212007, + "grad_norm": 2.765625, + "learning_rate": 0.01858141560414954, + "loss": 3.3724, + "mean_token_accuracy": 0.36739784479141235, + "num_tokens": 1944931677.0, + "step": 3805 + }, + { + "epoch": 1.0292049756625203, + "grad_norm": 3.6875, + "learning_rate": 0.018580565941875366, + "loss": 3.3077, + "mean_token_accuracy": 0.3621177673339844, + "num_tokens": 1945455770.0, + "step": 3806 + }, + { + "epoch": 1.02947539210384, + "grad_norm": 2.359375, + "learning_rate": 0.018579716047007137, + "loss": 3.3877, + "mean_token_accuracy": 0.3785228133201599, + "num_tokens": 1945979938.0, + "step": 3807 + }, + { + "epoch": 1.0297458085451596, + "grad_norm": 3.5625, + "learning_rate": 0.01857886591957093, + "loss": 3.0649, + "mean_token_accuracy": 0.4016992449760437, + "num_tokens": 1946504213.0, + "step": 3808 + }, + { + "epoch": 1.0300162249864793, + "grad_norm": 2.4375, + "learning_rate": 0.018578015559592827, + "loss": 3.2717, + "mean_token_accuracy": 0.4045146107673645, + "num_tokens": 1946989285.0, + "step": 3809 + }, + { + "epoch": 1.030286641427799, + "grad_norm": 3.3125, + "learning_rate": 0.018577164967098922, + "loss": 3.221, + "mean_token_accuracy": 0.36550670862197876, + "num_tokens": 1947471231.0, + "step": 3810 + }, + { + "epoch": 1.0305570578691183, + "grad_norm": 103.0, + "learning_rate": 0.018576314142115313, + "loss": 17.377, + "mean_token_accuracy": 6.631825817748904e-05, + "num_tokens": 1947976838.0, + "step": 3811 + }, + { + "epoch": 1.030827474310438, + "grad_norm": 6.875, + "learning_rate": 0.018575463084668106, + "loss": 3.7808, + "mean_token_accuracy": 0.3092828094959259, + "num_tokens": 1948501108.0, + "step": 3812 + }, + { + "epoch": 1.0310978907517576, + "grad_norm": 2.328125, + "learning_rate": 0.018574611794783414, + "loss": 3.7076, + "mean_token_accuracy": 0.3359013795852661, + "num_tokens": 1949025382.0, + "step": 3813 + }, + { + "epoch": 1.0313683071930773, + "grad_norm": 2.734375, + "learning_rate": 0.018573760272487354, + "loss": 3.4152, + "mean_token_accuracy": 0.3439757823944092, + "num_tokens": 1949538380.0, + "step": 3814 + }, + { + "epoch": 1.031638723634397, + "grad_norm": 3.515625, + "learning_rate": 0.018572908517806058, + "loss": 3.5007, + "mean_token_accuracy": 0.37372010946273804, + "num_tokens": 1950062632.0, + "step": 3815 + }, + { + "epoch": 1.0319091400757165, + "grad_norm": 3.125, + "learning_rate": 0.018572056530765657, + "loss": 3.4833, + "mean_token_accuracy": 0.3863615095615387, + "num_tokens": 1950563407.0, + "step": 3816 + }, + { + "epoch": 1.0321795565170362, + "grad_norm": 3.046875, + "learning_rate": 0.018571204311392295, + "loss": 3.6257, + "mean_token_accuracy": 0.35672202706336975, + "num_tokens": 1951087620.0, + "step": 3817 + }, + { + "epoch": 1.0324499729583558, + "grad_norm": 2.53125, + "learning_rate": 0.01857035185971212, + "loss": 3.5602, + "mean_token_accuracy": 0.3739129304885864, + "num_tokens": 1951611883.0, + "step": 3818 + }, + { + "epoch": 1.0327203893996755, + "grad_norm": 3.09375, + "learning_rate": 0.018569499175751285, + "loss": 3.425, + "mean_token_accuracy": 0.3530862629413605, + "num_tokens": 1952130003.0, + "step": 3819 + }, + { + "epoch": 1.0329908058409951, + "grad_norm": 2.5625, + "learning_rate": 0.018568646259535956, + "loss": 3.2929, + "mean_token_accuracy": 0.3720603585243225, + "num_tokens": 1952654268.0, + "step": 3820 + }, + { + "epoch": 1.0332612222823148, + "grad_norm": 2.796875, + "learning_rate": 0.018567793111092298, + "loss": 3.4622, + "mean_token_accuracy": 0.3657662868499756, + "num_tokens": 1953178535.0, + "step": 3821 + }, + { + "epoch": 1.0335316387236344, + "grad_norm": 3.015625, + "learning_rate": 0.018566939730446496, + "loss": 3.3295, + "mean_token_accuracy": 0.40400928258895874, + "num_tokens": 1953638506.0, + "step": 3822 + }, + { + "epoch": 1.033802055164954, + "grad_norm": 3.296875, + "learning_rate": 0.018566086117624732, + "loss": 3.3379, + "mean_token_accuracy": 0.35119497776031494, + "num_tokens": 1954162614.0, + "step": 3823 + }, + { + "epoch": 1.0340724716062737, + "grad_norm": 13.9375, + "learning_rate": 0.018565232272653193, + "loss": 3.052, + "mean_token_accuracy": 0.45527803897857666, + "num_tokens": 1954621390.0, + "step": 3824 + }, + { + "epoch": 1.0343428880475933, + "grad_norm": 2.859375, + "learning_rate": 0.01856437819555808, + "loss": 3.2274, + "mean_token_accuracy": 0.39130687713623047, + "num_tokens": 1955085655.0, + "step": 3825 + }, + { + "epoch": 1.034613304488913, + "grad_norm": 2.640625, + "learning_rate": 0.018563523886365597, + "loss": 3.3283, + "mean_token_accuracy": 0.3721351623535156, + "num_tokens": 1955609924.0, + "step": 3826 + }, + { + "epoch": 1.0348837209302326, + "grad_norm": 2.65625, + "learning_rate": 0.01856266934510196, + "loss": 3.159, + "mean_token_accuracy": 0.39524051547050476, + "num_tokens": 1956095936.0, + "step": 3827 + }, + { + "epoch": 1.0351541373715523, + "grad_norm": 2.8125, + "learning_rate": 0.018561814571793386, + "loss": 3.2907, + "mean_token_accuracy": 0.3628503978252411, + "num_tokens": 1956620143.0, + "step": 3828 + }, + { + "epoch": 1.035424553812872, + "grad_norm": 3.359375, + "learning_rate": 0.018560959566466105, + "loss": 3.2136, + "mean_token_accuracy": 0.38541844487190247, + "num_tokens": 1957140747.0, + "step": 3829 + }, + { + "epoch": 1.0356949702541915, + "grad_norm": 3.21875, + "learning_rate": 0.01856010432914635, + "loss": 3.4182, + "mean_token_accuracy": 0.3508816957473755, + "num_tokens": 1957664843.0, + "step": 3830 + }, + { + "epoch": 1.0359653866955112, + "grad_norm": 2.375, + "learning_rate": 0.018559248859860357, + "loss": 10.5898, + "mean_token_accuracy": 0.00023110874462872744, + "num_tokens": 1958188968.0, + "step": 3831 + }, + { + "epoch": 1.0362358031368306, + "grad_norm": 6.09375, + "learning_rate": 0.018558393158634383, + "loss": 3.8872, + "mean_token_accuracy": 0.32352349162101746, + "num_tokens": 1958713243.0, + "step": 3832 + }, + { + "epoch": 1.0365062195781503, + "grad_norm": 4.34375, + "learning_rate": 0.018557537225494677, + "loss": 3.7633, + "mean_token_accuracy": 0.35295504331588745, + "num_tokens": 1959197236.0, + "step": 3833 + }, + { + "epoch": 1.03677663601947, + "grad_norm": 2.765625, + "learning_rate": 0.018556681060467504, + "loss": 3.4011, + "mean_token_accuracy": 0.3627176284790039, + "num_tokens": 1959721368.0, + "step": 3834 + }, + { + "epoch": 1.0370470524607895, + "grad_norm": 2.75, + "learning_rate": 0.01855582466357913, + "loss": 3.6623, + "mean_token_accuracy": 0.34553635120391846, + "num_tokens": 1960245620.0, + "step": 3835 + }, + { + "epoch": 1.0373174689021092, + "grad_norm": 3.296875, + "learning_rate": 0.018554968034855842, + "loss": 3.4911, + "mean_token_accuracy": 0.3561544418334961, + "num_tokens": 1960769837.0, + "step": 3836 + }, + { + "epoch": 1.0375878853434288, + "grad_norm": 3.484375, + "learning_rate": 0.018554111174323915, + "loss": 3.6628, + "mean_token_accuracy": 0.34617555141448975, + "num_tokens": 1961287700.0, + "step": 3837 + }, + { + "epoch": 1.0378583017847485, + "grad_norm": 2.921875, + "learning_rate": 0.018553254082009637, + "loss": 3.453, + "mean_token_accuracy": 0.3464104235172272, + "num_tokens": 1961811824.0, + "step": 3838 + }, + { + "epoch": 1.0381287182260681, + "grad_norm": 2.828125, + "learning_rate": 0.018552396757939314, + "loss": 3.4307, + "mean_token_accuracy": 0.3510691225528717, + "num_tokens": 1962335932.0, + "step": 3839 + }, + { + "epoch": 1.0383991346673878, + "grad_norm": 3.1875, + "learning_rate": 0.01855153920213925, + "loss": 3.5646, + "mean_token_accuracy": 0.35340631008148193, + "num_tokens": 1962803307.0, + "step": 3840 + }, + { + "epoch": 1.0386695511087074, + "grad_norm": 3.265625, + "learning_rate": 0.018550681414635756, + "loss": 3.6269, + "mean_token_accuracy": 0.3348807692527771, + "num_tokens": 1963327557.0, + "step": 3841 + }, + { + "epoch": 1.038939967550027, + "grad_norm": 2.84375, + "learning_rate": 0.018549823395455148, + "loss": 3.2258, + "mean_token_accuracy": 0.38379693031311035, + "num_tokens": 1963813645.0, + "step": 3842 + }, + { + "epoch": 1.0392103839913467, + "grad_norm": 2.171875, + "learning_rate": 0.018548965144623758, + "loss": 3.3607, + "mean_token_accuracy": 0.3680773973464966, + "num_tokens": 1964330513.0, + "step": 3843 + }, + { + "epoch": 1.0394808004326663, + "grad_norm": 3.171875, + "learning_rate": 0.018548106662167912, + "loss": 3.5064, + "mean_token_accuracy": 0.3622429370880127, + "num_tokens": 1964834174.0, + "step": 3844 + }, + { + "epoch": 1.039751216873986, + "grad_norm": 2.78125, + "learning_rate": 0.01854724794811396, + "loss": 3.2767, + "mean_token_accuracy": 0.3782198429107666, + "num_tokens": 1965358352.0, + "step": 3845 + }, + { + "epoch": 1.0400216333153056, + "grad_norm": 2.5625, + "learning_rate": 0.01854638900248825, + "loss": 3.4451, + "mean_token_accuracy": 0.3826730251312256, + "num_tokens": 1965863905.0, + "step": 3846 + }, + { + "epoch": 1.0402920497566253, + "grad_norm": 2.34375, + "learning_rate": 0.01854552982531713, + "loss": 3.13, + "mean_token_accuracy": 0.3808392882347107, + "num_tokens": 1966388158.0, + "step": 3847 + }, + { + "epoch": 1.040562466197945, + "grad_norm": 2.953125, + "learning_rate": 0.018544670416626965, + "loss": 3.364, + "mean_token_accuracy": 0.372745156288147, + "num_tokens": 1966912431.0, + "step": 3848 + }, + { + "epoch": 1.0408328826392645, + "grad_norm": 3.25, + "learning_rate": 0.01854381077644412, + "loss": 3.164, + "mean_token_accuracy": 0.37661224603652954, + "num_tokens": 1967436656.0, + "step": 3849 + }, + { + "epoch": 1.0411032990805842, + "grad_norm": 2.84375, + "learning_rate": 0.01854295090479498, + "loss": 3.4093, + "mean_token_accuracy": 0.38319963216781616, + "num_tokens": 1967960837.0, + "step": 3850 + }, + { + "epoch": 1.0413737155219038, + "grad_norm": 96.0, + "learning_rate": 0.018542090801705925, + "loss": 24.2114, + "mean_token_accuracy": 0.04496752470731735, + "num_tokens": 1968485010.0, + "step": 3851 + }, + { + "epoch": 1.0416441319632233, + "grad_norm": 10.375, + "learning_rate": 0.01854123046720334, + "loss": 4.2853, + "mean_token_accuracy": 0.28412652015686035, + "num_tokens": 1969009209.0, + "step": 3852 + }, + { + "epoch": 1.041914548404543, + "grad_norm": 2.546875, + "learning_rate": 0.018540369901313632, + "loss": 3.4912, + "mean_token_accuracy": 0.34632939100265503, + "num_tokens": 1969533484.0, + "step": 3853 + }, + { + "epoch": 1.0421849648458625, + "grad_norm": 2.1875, + "learning_rate": 0.018539509104063198, + "loss": 3.4438, + "mean_token_accuracy": 0.37353062629699707, + "num_tokens": 1970029406.0, + "step": 3854 + }, + { + "epoch": 1.0424553812871822, + "grad_norm": 3.421875, + "learning_rate": 0.018538648075478448, + "loss": 3.5838, + "mean_token_accuracy": 0.3490232825279236, + "num_tokens": 1970553670.0, + "step": 3855 + }, + { + "epoch": 1.0427257977285018, + "grad_norm": 5.75, + "learning_rate": 0.018537786815585815, + "loss": 3.2901, + "mean_token_accuracy": 0.3950842618942261, + "num_tokens": 1971017370.0, + "step": 3856 + }, + { + "epoch": 1.0429962141698215, + "grad_norm": 1.96875, + "learning_rate": 0.018536925324411705, + "loss": 3.1451, + "mean_token_accuracy": 0.40237778425216675, + "num_tokens": 1971487881.0, + "step": 3857 + }, + { + "epoch": 1.043266630611141, + "grad_norm": 3.421875, + "learning_rate": 0.01853606360198257, + "loss": 3.1156, + "mean_token_accuracy": 0.4016515016555786, + "num_tokens": 1971974800.0, + "step": 3858 + }, + { + "epoch": 1.0435370470524608, + "grad_norm": 2.015625, + "learning_rate": 0.018535201648324836, + "loss": 3.3093, + "mean_token_accuracy": 0.3893791437149048, + "num_tokens": 1972499048.0, + "step": 3859 + }, + { + "epoch": 1.0438074634937804, + "grad_norm": 3.375, + "learning_rate": 0.018534339463464958, + "loss": 3.5472, + "mean_token_accuracy": 0.36257654428482056, + "num_tokens": 1973023219.0, + "step": 3860 + }, + { + "epoch": 1.0440778799351, + "grad_norm": 2.796875, + "learning_rate": 0.018533477047429386, + "loss": 3.1908, + "mean_token_accuracy": 0.36990445852279663, + "num_tokens": 1973547397.0, + "step": 3861 + }, + { + "epoch": 1.0443482963764197, + "grad_norm": 3.375, + "learning_rate": 0.018532614400244587, + "loss": 3.2129, + "mean_token_accuracy": 0.3774058520793915, + "num_tokens": 1974071541.0, + "step": 3862 + }, + { + "epoch": 1.0446187128177393, + "grad_norm": 2.9375, + "learning_rate": 0.018531751521937024, + "loss": 3.3794, + "mean_token_accuracy": 0.3652260899543762, + "num_tokens": 1974568369.0, + "step": 3863 + }, + { + "epoch": 1.044889129259059, + "grad_norm": 4.5, + "learning_rate": 0.018530888412533177, + "loss": 3.5747, + "mean_token_accuracy": 0.36140280961990356, + "num_tokens": 1975092601.0, + "step": 3864 + }, + { + "epoch": 1.0451595457003786, + "grad_norm": 3.28125, + "learning_rate": 0.018530025072059524, + "loss": 3.3904, + "mean_token_accuracy": 0.3771563470363617, + "num_tokens": 1975606459.0, + "step": 3865 + }, + { + "epoch": 1.0454299621416983, + "grad_norm": 4.0625, + "learning_rate": 0.01852916150054256, + "loss": 3.4506, + "mean_token_accuracy": 0.34802061319351196, + "num_tokens": 1976130704.0, + "step": 3866 + }, + { + "epoch": 1.045700378583018, + "grad_norm": 2.828125, + "learning_rate": 0.01852829769800878, + "loss": 3.5233, + "mean_token_accuracy": 0.36739063262939453, + "num_tokens": 1976654985.0, + "step": 3867 + }, + { + "epoch": 1.0459707950243375, + "grad_norm": 3.09375, + "learning_rate": 0.018527433664484682, + "loss": 3.3969, + "mean_token_accuracy": 0.3532310128211975, + "num_tokens": 1977179038.0, + "step": 3868 + }, + { + "epoch": 1.0462412114656572, + "grad_norm": 3.03125, + "learning_rate": 0.018526569399996785, + "loss": 3.345, + "mean_token_accuracy": 0.3982172906398773, + "num_tokens": 1977671984.0, + "step": 3869 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 3.015625, + "learning_rate": 0.01852570490457161, + "loss": 3.1692, + "mean_token_accuracy": 0.4004689157009125, + "num_tokens": 1978139429.0, + "step": 3870 + }, + { + "epoch": 1.0467820443482965, + "grad_norm": 3.34375, + "learning_rate": 0.01852484017823567, + "loss": 10.4815, + "mean_token_accuracy": 3.934563574148342e-05, + "num_tokens": 1978657877.0, + "step": 3871 + }, + { + "epoch": 1.0470524607896161, + "grad_norm": 14.1875, + "learning_rate": 0.018523975221015506, + "loss": 3.8703, + "mean_token_accuracy": 0.35938772559165955, + "num_tokens": 1979182053.0, + "step": 3872 + }, + { + "epoch": 1.0473228772309355, + "grad_norm": 3.421875, + "learning_rate": 0.01852311003293766, + "loss": 3.2602, + "mean_token_accuracy": 0.42481517791748047, + "num_tokens": 1979706066.0, + "step": 3873 + }, + { + "epoch": 1.0475932936722552, + "grad_norm": 3.28125, + "learning_rate": 0.01852224461402867, + "loss": 2.8934, + "mean_token_accuracy": 0.41103076934814453, + "num_tokens": 1980192517.0, + "step": 3874 + }, + { + "epoch": 1.0478637101135748, + "grad_norm": 3.140625, + "learning_rate": 0.01852137896431509, + "loss": 3.6549, + "mean_token_accuracy": 0.3538625240325928, + "num_tokens": 1980716797.0, + "step": 3875 + }, + { + "epoch": 1.0481341265548945, + "grad_norm": 2.90625, + "learning_rate": 0.01852051308382349, + "loss": 3.3634, + "mean_token_accuracy": 0.35900968313217163, + "num_tokens": 1981240896.0, + "step": 3876 + }, + { + "epoch": 1.048404542996214, + "grad_norm": 2.859375, + "learning_rate": 0.01851964697258043, + "loss": 3.3818, + "mean_token_accuracy": 0.4021204113960266, + "num_tokens": 1981702547.0, + "step": 3877 + }, + { + "epoch": 1.0486749594375337, + "grad_norm": 3.15625, + "learning_rate": 0.018518780630612488, + "loss": 3.53, + "mean_token_accuracy": 0.34512221813201904, + "num_tokens": 1982201124.0, + "step": 3878 + }, + { + "epoch": 1.0489453758788534, + "grad_norm": 3.09375, + "learning_rate": 0.018517914057946245, + "loss": 3.5722, + "mean_token_accuracy": 0.3481650948524475, + "num_tokens": 1982725242.0, + "step": 3879 + }, + { + "epoch": 1.049215792320173, + "grad_norm": 2.4375, + "learning_rate": 0.01851704725460829, + "loss": 3.4169, + "mean_token_accuracy": 0.36095869541168213, + "num_tokens": 1983249490.0, + "step": 3880 + }, + { + "epoch": 1.0494862087614927, + "grad_norm": 3.375, + "learning_rate": 0.01851618022062522, + "loss": 3.324, + "mean_token_accuracy": 0.376179039478302, + "num_tokens": 1983773753.0, + "step": 3881 + }, + { + "epoch": 1.0497566252028123, + "grad_norm": 2.21875, + "learning_rate": 0.018515312956023634, + "loss": 3.4689, + "mean_token_accuracy": 0.3806186318397522, + "num_tokens": 1984280180.0, + "step": 3882 + }, + { + "epoch": 1.050027041644132, + "grad_norm": 2.828125, + "learning_rate": 0.01851444546083015, + "loss": 3.3796, + "mean_token_accuracy": 0.36219239234924316, + "num_tokens": 1984804420.0, + "step": 3883 + }, + { + "epoch": 1.0502974580854516, + "grad_norm": 2.546875, + "learning_rate": 0.018513577735071375, + "loss": 3.1621, + "mean_token_accuracy": 0.39101171493530273, + "num_tokens": 1985328703.0, + "step": 3884 + }, + { + "epoch": 1.0505678745267713, + "grad_norm": 3.359375, + "learning_rate": 0.018512709778773943, + "loss": 3.3836, + "mean_token_accuracy": 0.37761664390563965, + "num_tokens": 1985852841.0, + "step": 3885 + }, + { + "epoch": 1.050838290968091, + "grad_norm": 2.75, + "learning_rate": 0.01851184159196448, + "loss": 3.4368, + "mean_token_accuracy": 0.3497868478298187, + "num_tokens": 1986377057.0, + "step": 3886 + }, + { + "epoch": 1.0511087074094105, + "grad_norm": 2.96875, + "learning_rate": 0.018510973174669626, + "loss": 3.4189, + "mean_token_accuracy": 0.36471015214920044, + "num_tokens": 1986901327.0, + "step": 3887 + }, + { + "epoch": 1.0513791238507302, + "grad_norm": 3.0625, + "learning_rate": 0.018510104526916028, + "loss": 3.4493, + "mean_token_accuracy": 0.38914522528648376, + "num_tokens": 1987317112.0, + "step": 3888 + }, + { + "epoch": 1.0516495402920498, + "grad_norm": 1.9453125, + "learning_rate": 0.01850923564873034, + "loss": 3.1101, + "mean_token_accuracy": 0.37791746854782104, + "num_tokens": 1987841379.0, + "step": 3889 + }, + { + "epoch": 1.0519199567333695, + "grad_norm": 2.296875, + "learning_rate": 0.018508366540139213, + "loss": 3.2558, + "mean_token_accuracy": 0.38372868299484253, + "num_tokens": 1988365504.0, + "step": 3890 + }, + { + "epoch": 1.052190373174689, + "grad_norm": 29.25, + "learning_rate": 0.018507497201169322, + "loss": 12.2343, + "mean_token_accuracy": 0.03004508465528488, + "num_tokens": 1988859604.0, + "step": 3891 + }, + { + "epoch": 1.0524607896160088, + "grad_norm": 7.96875, + "learning_rate": 0.01850662763184734, + "loss": 4.0237, + "mean_token_accuracy": 0.30287277698516846, + "num_tokens": 1989326408.0, + "step": 3892 + }, + { + "epoch": 1.0527312060573282, + "grad_norm": 7.0, + "learning_rate": 0.018505757832199946, + "loss": 3.3094, + "mean_token_accuracy": 0.3795322775840759, + "num_tokens": 1989829055.0, + "step": 3893 + }, + { + "epoch": 1.0530016224986478, + "grad_norm": 2.703125, + "learning_rate": 0.01850488780225383, + "loss": 3.6107, + "mean_token_accuracy": 0.3559480905532837, + "num_tokens": 1990353166.0, + "step": 3894 + }, + { + "epoch": 1.0532720389399675, + "grad_norm": 2.34375, + "learning_rate": 0.018504017542035685, + "loss": 3.2896, + "mean_token_accuracy": 0.3695923686027527, + "num_tokens": 1990865070.0, + "step": 3895 + }, + { + "epoch": 1.053542455381287, + "grad_norm": 3.53125, + "learning_rate": 0.018503147051572213, + "loss": 3.4985, + "mean_token_accuracy": 0.3708294928073883, + "num_tokens": 1991389258.0, + "step": 3896 + }, + { + "epoch": 1.0538128718226067, + "grad_norm": 4.34375, + "learning_rate": 0.018502276330890126, + "loss": 3.47, + "mean_token_accuracy": 0.34198734164237976, + "num_tokens": 1991913478.0, + "step": 3897 + }, + { + "epoch": 1.0540832882639264, + "grad_norm": 4.0625, + "learning_rate": 0.018501405380016135, + "loss": 3.7115, + "mean_token_accuracy": 0.354114830493927, + "num_tokens": 1992437734.0, + "step": 3898 + }, + { + "epoch": 1.054353704705246, + "grad_norm": 4.3125, + "learning_rate": 0.01850053419897697, + "loss": 3.5188, + "mean_token_accuracy": 0.3554876446723938, + "num_tokens": 1992961480.0, + "step": 3899 + }, + { + "epoch": 1.0546241211465657, + "grad_norm": 2.5, + "learning_rate": 0.018499662787799354, + "loss": 3.5729, + "mean_token_accuracy": 0.3727007508277893, + "num_tokens": 1993485571.0, + "step": 3900 + }, + { + "epoch": 1.0548945375878853, + "grad_norm": 3.5, + "learning_rate": 0.018498791146510034, + "loss": 3.5579, + "mean_token_accuracy": 0.3725489377975464, + "num_tokens": 1994009773.0, + "step": 3901 + }, + { + "epoch": 1.055164954029205, + "grad_norm": 2.953125, + "learning_rate": 0.01849791927513575, + "loss": 3.4188, + "mean_token_accuracy": 0.37768179178237915, + "num_tokens": 1994503993.0, + "step": 3902 + }, + { + "epoch": 1.0554353704705246, + "grad_norm": 3.140625, + "learning_rate": 0.018497047173703245, + "loss": 3.3222, + "mean_token_accuracy": 0.37245064973831177, + "num_tokens": 1995028259.0, + "step": 3903 + }, + { + "epoch": 1.0557057869118442, + "grad_norm": 3.203125, + "learning_rate": 0.01849617484223929, + "loss": 3.1823, + "mean_token_accuracy": 0.3789549469947815, + "num_tokens": 1995552455.0, + "step": 3904 + }, + { + "epoch": 1.055976203353164, + "grad_norm": 3.3125, + "learning_rate": 0.018495302280770648, + "loss": 3.3102, + "mean_token_accuracy": 0.37967729568481445, + "num_tokens": 1996076576.0, + "step": 3905 + }, + { + "epoch": 1.0562466197944835, + "grad_norm": 2.859375, + "learning_rate": 0.018494429489324088, + "loss": 3.2836, + "mean_token_accuracy": 0.38006624579429626, + "num_tokens": 1996582121.0, + "step": 3906 + }, + { + "epoch": 1.0565170362358032, + "grad_norm": 3.21875, + "learning_rate": 0.01849355646792639, + "loss": 3.5629, + "mean_token_accuracy": 0.37956345081329346, + "num_tokens": 1997082319.0, + "step": 3907 + }, + { + "epoch": 1.0567874526771228, + "grad_norm": 3.171875, + "learning_rate": 0.018492683216604345, + "loss": 3.4604, + "mean_token_accuracy": 0.3598567247390747, + "num_tokens": 1997606601.0, + "step": 3908 + }, + { + "epoch": 1.0570578691184425, + "grad_norm": 2.6875, + "learning_rate": 0.01849180973538474, + "loss": 3.4208, + "mean_token_accuracy": 0.38250041007995605, + "num_tokens": 1998124722.0, + "step": 3909 + }, + { + "epoch": 1.057328285559762, + "grad_norm": 3.0625, + "learning_rate": 0.018490936024294385, + "loss": 3.29, + "mean_token_accuracy": 0.37862804532051086, + "num_tokens": 1998648866.0, + "step": 3910 + }, + { + "epoch": 1.0575987020010817, + "grad_norm": 2.453125, + "learning_rate": 0.01849006208336008, + "loss": 11.0466, + "mean_token_accuracy": 6.917351583979325e-06, + "num_tokens": 1999172929.0, + "step": 3911 + }, + { + "epoch": 1.0578691184424014, + "grad_norm": 7.0625, + "learning_rate": 0.018489187912608646, + "loss": 4.1941, + "mean_token_accuracy": 0.26304832100868225, + "num_tokens": 1999697018.0, + "step": 3912 + }, + { + "epoch": 1.058139534883721, + "grad_norm": 2.078125, + "learning_rate": 0.018488313512066902, + "loss": 3.6204, + "mean_token_accuracy": 0.36408430337905884, + "num_tokens": 2000189918.0, + "step": 3913 + }, + { + "epoch": 1.0584099513250407, + "grad_norm": 2.484375, + "learning_rate": 0.01848743888176168, + "loss": 3.634, + "mean_token_accuracy": 0.3764592707157135, + "num_tokens": 2000653553.0, + "step": 3914 + }, + { + "epoch": 1.05868036776636, + "grad_norm": 3.328125, + "learning_rate": 0.01848656402171981, + "loss": 3.3767, + "mean_token_accuracy": 0.36944085359573364, + "num_tokens": 2001143388.0, + "step": 3915 + }, + { + "epoch": 1.0589507842076797, + "grad_norm": 2.59375, + "learning_rate": 0.01848568893196814, + "loss": 3.4337, + "mean_token_accuracy": 0.36677759885787964, + "num_tokens": 2001667655.0, + "step": 3916 + }, + { + "epoch": 1.0592212006489994, + "grad_norm": 3.390625, + "learning_rate": 0.01848481361253352, + "loss": 3.4503, + "mean_token_accuracy": 0.38271886110305786, + "num_tokens": 2002191935.0, + "step": 3917 + }, + { + "epoch": 1.059491617090319, + "grad_norm": 2.515625, + "learning_rate": 0.01848393806344281, + "loss": 3.2445, + "mean_token_accuracy": 0.3703955113887787, + "num_tokens": 2002716208.0, + "step": 3918 + }, + { + "epoch": 1.0597620335316387, + "grad_norm": 2.265625, + "learning_rate": 0.018483062284722865, + "loss": 3.2317, + "mean_token_accuracy": 0.39120084047317505, + "num_tokens": 2003240474.0, + "step": 3919 + }, + { + "epoch": 1.0600324499729583, + "grad_norm": 2.796875, + "learning_rate": 0.018482186276400567, + "loss": 3.4064, + "mean_token_accuracy": 0.36435937881469727, + "num_tokens": 2003764726.0, + "step": 3920 + }, + { + "epoch": 1.060302866414278, + "grad_norm": 3.109375, + "learning_rate": 0.01848131003850279, + "loss": 3.4881, + "mean_token_accuracy": 0.3550116717815399, + "num_tokens": 2004288918.0, + "step": 3921 + }, + { + "epoch": 1.0605732828555976, + "grad_norm": 3.234375, + "learning_rate": 0.018480433571056422, + "loss": 3.4345, + "mean_token_accuracy": 0.3611399531364441, + "num_tokens": 2004813130.0, + "step": 3922 + }, + { + "epoch": 1.0608436992969172, + "grad_norm": 3.4375, + "learning_rate": 0.01847955687408835, + "loss": 3.2602, + "mean_token_accuracy": 0.39182931184768677, + "num_tokens": 2005337236.0, + "step": 3923 + }, + { + "epoch": 1.0611141157382369, + "grad_norm": 3.0625, + "learning_rate": 0.018478679947625476, + "loss": 3.3714, + "mean_token_accuracy": 0.37420713901519775, + "num_tokens": 2005847809.0, + "step": 3924 + }, + { + "epoch": 1.0613845321795565, + "grad_norm": 3.09375, + "learning_rate": 0.01847780279169471, + "loss": 3.1705, + "mean_token_accuracy": 0.39432141184806824, + "num_tokens": 2006372085.0, + "step": 3925 + }, + { + "epoch": 1.0616549486208762, + "grad_norm": 2.46875, + "learning_rate": 0.01847692540632297, + "loss": 3.1648, + "mean_token_accuracy": 0.3886435627937317, + "num_tokens": 2006896328.0, + "step": 3926 + }, + { + "epoch": 1.0619253650621958, + "grad_norm": 2.75, + "learning_rate": 0.01847604779153716, + "loss": 3.4335, + "mean_token_accuracy": 0.37190937995910645, + "num_tokens": 2007420549.0, + "step": 3927 + }, + { + "epoch": 1.0621957815035155, + "grad_norm": 3.453125, + "learning_rate": 0.018475169947364225, + "loss": 3.1717, + "mean_token_accuracy": 0.40011489391326904, + "num_tokens": 2007944726.0, + "step": 3928 + }, + { + "epoch": 1.062466197944835, + "grad_norm": 3.625, + "learning_rate": 0.018474291873831088, + "loss": 3.2589, + "mean_token_accuracy": 0.3611024022102356, + "num_tokens": 2008468857.0, + "step": 3929 + }, + { + "epoch": 1.0627366143861547, + "grad_norm": 2.71875, + "learning_rate": 0.0184734135709647, + "loss": 3.4817, + "mean_token_accuracy": 0.3465462923049927, + "num_tokens": 2008993129.0, + "step": 3930 + }, + { + "epoch": 1.0630070308274744, + "grad_norm": 7.1875, + "learning_rate": 0.018472535038792003, + "loss": 8.9154, + "mean_token_accuracy": 0.01469402201473713, + "num_tokens": 2009517353.0, + "step": 3931 + }, + { + "epoch": 1.063277447268794, + "grad_norm": 8.4375, + "learning_rate": 0.018471656277339956, + "loss": 4.4828, + "mean_token_accuracy": 0.23396335542201996, + "num_tokens": 2010041630.0, + "step": 3932 + }, + { + "epoch": 1.0635478637101137, + "grad_norm": 2.6875, + "learning_rate": 0.01847077728663552, + "loss": 3.4692, + "mean_token_accuracy": 0.3278118968009949, + "num_tokens": 2010565840.0, + "step": 3933 + }, + { + "epoch": 1.063818280151433, + "grad_norm": 3.59375, + "learning_rate": 0.018469898066705674, + "loss": 3.4616, + "mean_token_accuracy": 0.3578178286552429, + "num_tokens": 2011090094.0, + "step": 3934 + }, + { + "epoch": 1.0640886965927527, + "grad_norm": 3.71875, + "learning_rate": 0.01846901861757738, + "loss": 3.348, + "mean_token_accuracy": 0.35514935851097107, + "num_tokens": 2011614343.0, + "step": 3935 + }, + { + "epoch": 1.0643591130340724, + "grad_norm": 2.40625, + "learning_rate": 0.018468138939277633, + "loss": 3.5508, + "mean_token_accuracy": 0.3472944498062134, + "num_tokens": 2012138520.0, + "step": 3936 + }, + { + "epoch": 1.064629529475392, + "grad_norm": 5.28125, + "learning_rate": 0.01846725903183342, + "loss": 3.8666, + "mean_token_accuracy": 0.2987545132637024, + "num_tokens": 2012662734.0, + "step": 3937 + }, + { + "epoch": 1.0648999459167117, + "grad_norm": 2.1875, + "learning_rate": 0.01846637889527174, + "loss": 3.2241, + "mean_token_accuracy": 0.37421274185180664, + "num_tokens": 2013186938.0, + "step": 3938 + }, + { + "epoch": 1.0651703623580313, + "grad_norm": 4.28125, + "learning_rate": 0.018465498529619602, + "loss": 3.4597, + "mean_token_accuracy": 0.3602778911590576, + "num_tokens": 2013698388.0, + "step": 3939 + }, + { + "epoch": 1.065440778799351, + "grad_norm": 2.34375, + "learning_rate": 0.01846461793490401, + "loss": 3.2948, + "mean_token_accuracy": 0.37307748198509216, + "num_tokens": 2014222631.0, + "step": 3940 + }, + { + "epoch": 1.0657111952406706, + "grad_norm": 3.796875, + "learning_rate": 0.01846373711115199, + "loss": 3.6193, + "mean_token_accuracy": 0.3629024028778076, + "num_tokens": 2014746647.0, + "step": 3941 + }, + { + "epoch": 1.0659816116819902, + "grad_norm": 3.703125, + "learning_rate": 0.018462856058390566, + "loss": 3.5734, + "mean_token_accuracy": 0.36336469650268555, + "num_tokens": 2015214382.0, + "step": 3942 + }, + { + "epoch": 1.0662520281233099, + "grad_norm": 2.84375, + "learning_rate": 0.01846197477664677, + "loss": 3.5878, + "mean_token_accuracy": 0.3526044487953186, + "num_tokens": 2015738664.0, + "step": 3943 + }, + { + "epoch": 1.0665224445646295, + "grad_norm": 2.8125, + "learning_rate": 0.018461093265947644, + "loss": 3.4175, + "mean_token_accuracy": 0.3631836771965027, + "num_tokens": 2016262645.0, + "step": 3944 + }, + { + "epoch": 1.0667928610059492, + "grad_norm": 2.578125, + "learning_rate": 0.018460211526320235, + "loss": 3.4117, + "mean_token_accuracy": 0.38902029395103455, + "num_tokens": 2016786757.0, + "step": 3945 + }, + { + "epoch": 1.0670632774472688, + "grad_norm": 3.671875, + "learning_rate": 0.018459329557791598, + "loss": 3.1611, + "mean_token_accuracy": 0.39229875802993774, + "num_tokens": 2017310940.0, + "step": 3946 + }, + { + "epoch": 1.0673336938885885, + "grad_norm": 3.078125, + "learning_rate": 0.01845844736038879, + "loss": 3.133, + "mean_token_accuracy": 0.3985483646392822, + "num_tokens": 2017835096.0, + "step": 3947 + }, + { + "epoch": 1.067604110329908, + "grad_norm": 2.8125, + "learning_rate": 0.018457564934138887, + "loss": 3.2457, + "mean_token_accuracy": 0.3859151303768158, + "num_tokens": 2018359243.0, + "step": 3948 + }, + { + "epoch": 1.0678745267712277, + "grad_norm": 2.53125, + "learning_rate": 0.018456682279068964, + "loss": 3.333, + "mean_token_accuracy": 0.38596343994140625, + "num_tokens": 2018883423.0, + "step": 3949 + }, + { + "epoch": 1.0681449432125474, + "grad_norm": 2.390625, + "learning_rate": 0.018455799395206095, + "loss": 3.3423, + "mean_token_accuracy": 0.3883207440376282, + "num_tokens": 2019400842.0, + "step": 3950 + }, + { + "epoch": 1.068415359653867, + "grad_norm": 3.5, + "learning_rate": 0.018454916282577376, + "loss": 10.9111, + "mean_token_accuracy": 2.5008503143908456e-06, + "num_tokens": 2019853933.0, + "step": 3951 + }, + { + "epoch": 1.0686857760951867, + "grad_norm": 39.25, + "learning_rate": 0.0184540329412099, + "loss": 3.9696, + "mean_token_accuracy": 0.28484490513801575, + "num_tokens": 2020378102.0, + "step": 3952 + }, + { + "epoch": 1.0689561925365063, + "grad_norm": 5.28125, + "learning_rate": 0.018453149371130774, + "loss": 4.2816, + "mean_token_accuracy": 0.27456870675086975, + "num_tokens": 2020902357.0, + "step": 3953 + }, + { + "epoch": 1.069226608977826, + "grad_norm": 3.125, + "learning_rate": 0.018452265572367108, + "loss": 3.4536, + "mean_token_accuracy": 0.3627322316169739, + "num_tokens": 2021408174.0, + "step": 3954 + }, + { + "epoch": 1.0694970254191456, + "grad_norm": 5.78125, + "learning_rate": 0.018451381544946018, + "loss": 3.7901, + "mean_token_accuracy": 0.3538777828216553, + "num_tokens": 2021932397.0, + "step": 3955 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 3.59375, + "learning_rate": 0.01845049728889463, + "loss": 3.4898, + "mean_token_accuracy": 0.31369224190711975, + "num_tokens": 2022456613.0, + "step": 3956 + }, + { + "epoch": 1.0700378583017847, + "grad_norm": 3.15625, + "learning_rate": 0.01844961280424007, + "loss": 3.5117, + "mean_token_accuracy": 0.36999136209487915, + "num_tokens": 2022950263.0, + "step": 3957 + }, + { + "epoch": 1.0703082747431043, + "grad_norm": 3.484375, + "learning_rate": 0.018448728091009486, + "loss": 3.5593, + "mean_token_accuracy": 0.3488626778125763, + "num_tokens": 2023442311.0, + "step": 3958 + }, + { + "epoch": 1.070578691184424, + "grad_norm": 2.296875, + "learning_rate": 0.018447843149230016, + "loss": 3.3177, + "mean_token_accuracy": 0.3806909918785095, + "num_tokens": 2023966550.0, + "step": 3959 + }, + { + "epoch": 1.0708491076257436, + "grad_norm": 2.765625, + "learning_rate": 0.018446957978928816, + "loss": 3.2667, + "mean_token_accuracy": 0.3751642107963562, + "num_tokens": 2024490812.0, + "step": 3960 + }, + { + "epoch": 1.0711195240670632, + "grad_norm": 3.0625, + "learning_rate": 0.018446072580133047, + "loss": 3.3112, + "mean_token_accuracy": 0.3765072226524353, + "num_tokens": 2025014905.0, + "step": 3961 + }, + { + "epoch": 1.0713899405083829, + "grad_norm": 3.046875, + "learning_rate": 0.018445186952869876, + "loss": 3.53, + "mean_token_accuracy": 0.35629862546920776, + "num_tokens": 2025539113.0, + "step": 3962 + }, + { + "epoch": 1.0716603569497025, + "grad_norm": 2.5625, + "learning_rate": 0.01844430109716647, + "loss": 3.2344, + "mean_token_accuracy": 0.38279348611831665, + "num_tokens": 2026063367.0, + "step": 3963 + }, + { + "epoch": 1.0719307733910222, + "grad_norm": 3.09375, + "learning_rate": 0.018443415013050014, + "loss": 3.1826, + "mean_token_accuracy": 0.38232865929603577, + "num_tokens": 2026548671.0, + "step": 3964 + }, + { + "epoch": 1.0722011898323418, + "grad_norm": 4.34375, + "learning_rate": 0.018442528700547696, + "loss": 3.5598, + "mean_token_accuracy": 0.35326701402664185, + "num_tokens": 2027072864.0, + "step": 3965 + }, + { + "epoch": 1.0724716062736614, + "grad_norm": 4.15625, + "learning_rate": 0.018441642159686713, + "loss": 3.6181, + "mean_token_accuracy": 0.37319469451904297, + "num_tokens": 2027569439.0, + "step": 3966 + }, + { + "epoch": 1.072742022714981, + "grad_norm": 2.4375, + "learning_rate": 0.01844075539049426, + "loss": 3.2889, + "mean_token_accuracy": 0.39225706458091736, + "num_tokens": 2028033526.0, + "step": 3967 + }, + { + "epoch": 1.0730124391563007, + "grad_norm": 3.421875, + "learning_rate": 0.018439868392997553, + "loss": 3.4626, + "mean_token_accuracy": 0.36756327748298645, + "num_tokens": 2028557696.0, + "step": 3968 + }, + { + "epoch": 1.0732828555976204, + "grad_norm": 2.4375, + "learning_rate": 0.018438981167223804, + "loss": 3.3279, + "mean_token_accuracy": 0.37248727679252625, + "num_tokens": 2029081736.0, + "step": 3969 + }, + { + "epoch": 1.07355327203894, + "grad_norm": 3.375, + "learning_rate": 0.018438093713200238, + "loss": 3.4596, + "mean_token_accuracy": 0.36864161491394043, + "num_tokens": 2029605907.0, + "step": 3970 + }, + { + "epoch": 1.0738236884802597, + "grad_norm": 52.5, + "learning_rate": 0.01843720603095408, + "loss": 13.4298, + "mean_token_accuracy": 0.040266167372465134, + "num_tokens": 2030129917.0, + "step": 3971 + }, + { + "epoch": 1.0740941049215793, + "grad_norm": 7.6875, + "learning_rate": 0.01843631812051257, + "loss": 3.948, + "mean_token_accuracy": 0.2668030261993408, + "num_tokens": 2030642211.0, + "step": 3972 + }, + { + "epoch": 1.074364521362899, + "grad_norm": 2.703125, + "learning_rate": 0.01843542998190295, + "loss": 3.4392, + "mean_token_accuracy": 0.3639989495277405, + "num_tokens": 2031163602.0, + "step": 3973 + }, + { + "epoch": 1.0746349378042186, + "grad_norm": 2.90625, + "learning_rate": 0.018434541615152476, + "loss": 3.3601, + "mean_token_accuracy": 0.378395676612854, + "num_tokens": 2031687875.0, + "step": 3974 + }, + { + "epoch": 1.074905354245538, + "grad_norm": 2.34375, + "learning_rate": 0.0184336530202884, + "loss": 3.3598, + "mean_token_accuracy": 0.3717283010482788, + "num_tokens": 2032212135.0, + "step": 3975 + }, + { + "epoch": 1.0751757706868577, + "grad_norm": 3.03125, + "learning_rate": 0.018432764197337987, + "loss": 3.2995, + "mean_token_accuracy": 0.37460410594940186, + "num_tokens": 2032736289.0, + "step": 3976 + }, + { + "epoch": 1.0754461871281773, + "grad_norm": 3.609375, + "learning_rate": 0.01843187514632851, + "loss": 3.4877, + "mean_token_accuracy": 0.36919283866882324, + "num_tokens": 2033260454.0, + "step": 3977 + }, + { + "epoch": 1.075716603569497, + "grad_norm": 3.515625, + "learning_rate": 0.01843098586728725, + "loss": 3.3563, + "mean_token_accuracy": 0.3627685308456421, + "num_tokens": 2033784517.0, + "step": 3978 + }, + { + "epoch": 1.0759870200108166, + "grad_norm": 2.140625, + "learning_rate": 0.01843009636024149, + "loss": 3.2562, + "mean_token_accuracy": 0.38664698600769043, + "num_tokens": 2034274541.0, + "step": 3979 + }, + { + "epoch": 1.0762574364521362, + "grad_norm": 7.5625, + "learning_rate": 0.018429206625218523, + "loss": 3.6435, + "mean_token_accuracy": 0.324308842420578, + "num_tokens": 2034798802.0, + "step": 3980 + }, + { + "epoch": 1.0765278528934559, + "grad_norm": 2.078125, + "learning_rate": 0.018428316662245647, + "loss": 3.2715, + "mean_token_accuracy": 0.36971479654312134, + "num_tokens": 2035322975.0, + "step": 3981 + }, + { + "epoch": 1.0767982693347755, + "grad_norm": 3.34375, + "learning_rate": 0.01842742647135017, + "loss": 3.4737, + "mean_token_accuracy": 0.3659774363040924, + "num_tokens": 2035847240.0, + "step": 3982 + }, + { + "epoch": 1.0770686857760952, + "grad_norm": 4.15625, + "learning_rate": 0.018426536052559412, + "loss": 3.0125, + "mean_token_accuracy": 0.41664958000183105, + "num_tokens": 2036371452.0, + "step": 3983 + }, + { + "epoch": 1.0773391022174148, + "grad_norm": 1.765625, + "learning_rate": 0.018425645405900682, + "loss": 3.2121, + "mean_token_accuracy": 0.38561272621154785, + "num_tokens": 2036895697.0, + "step": 3984 + }, + { + "epoch": 1.0776095186587344, + "grad_norm": 3.296875, + "learning_rate": 0.018424754531401317, + "loss": 3.3927, + "mean_token_accuracy": 0.36515313386917114, + "num_tokens": 2037419963.0, + "step": 3985 + }, + { + "epoch": 1.077879935100054, + "grad_norm": 2.3125, + "learning_rate": 0.018423863429088642, + "loss": 3.3676, + "mean_token_accuracy": 0.37914034724235535, + "num_tokens": 2037944179.0, + "step": 3986 + }, + { + "epoch": 1.0781503515413737, + "grad_norm": 3.5, + "learning_rate": 0.01842297209899001, + "loss": 3.4846, + "mean_token_accuracy": 0.334722101688385, + "num_tokens": 2038468449.0, + "step": 3987 + }, + { + "epoch": 1.0784207679826934, + "grad_norm": 2.265625, + "learning_rate": 0.018422080541132767, + "loss": 3.1621, + "mean_token_accuracy": 0.3838191032409668, + "num_tokens": 2038992730.0, + "step": 3988 + }, + { + "epoch": 1.078691184424013, + "grad_norm": 3.0625, + "learning_rate": 0.018421188755544264, + "loss": 3.3322, + "mean_token_accuracy": 0.376316636800766, + "num_tokens": 2039516795.0, + "step": 3989 + }, + { + "epoch": 1.0789616008653327, + "grad_norm": 3.1875, + "learning_rate": 0.01842029674225186, + "loss": 3.1875, + "mean_token_accuracy": 0.38056814670562744, + "num_tokens": 2040041051.0, + "step": 3990 + }, + { + "epoch": 1.0792320173066523, + "grad_norm": 2.40625, + "learning_rate": 0.018419404501282936, + "loss": 11.2197, + "mean_token_accuracy": 2.528010327296215e-06, + "num_tokens": 2040531391.0, + "step": 3991 + }, + { + "epoch": 1.079502433747972, + "grad_norm": 11.1875, + "learning_rate": 0.018418512032664865, + "loss": 3.8804, + "mean_token_accuracy": 0.337253212928772, + "num_tokens": 2041032379.0, + "step": 3992 + }, + { + "epoch": 1.0797728501892916, + "grad_norm": 3.8125, + "learning_rate": 0.018417619336425023, + "loss": 3.6115, + "mean_token_accuracy": 0.35904812812805176, + "num_tokens": 2041494298.0, + "step": 3993 + }, + { + "epoch": 1.0800432666306112, + "grad_norm": 4.875, + "learning_rate": 0.018416726412590807, + "loss": 3.8233, + "mean_token_accuracy": 0.3355976343154907, + "num_tokens": 2042018483.0, + "step": 3994 + }, + { + "epoch": 1.0803136830719309, + "grad_norm": 3.203125, + "learning_rate": 0.018415833261189615, + "loss": 3.3303, + "mean_token_accuracy": 0.3613124489784241, + "num_tokens": 2042542657.0, + "step": 3995 + }, + { + "epoch": 1.0805840995132505, + "grad_norm": 2.625, + "learning_rate": 0.018414939882248852, + "loss": 3.3975, + "mean_token_accuracy": 0.3723997175693512, + "num_tokens": 2043030424.0, + "step": 3996 + }, + { + "epoch": 1.08085451595457, + "grad_norm": 2.765625, + "learning_rate": 0.018414046275795924, + "loss": 3.3656, + "mean_token_accuracy": 0.3787510097026825, + "num_tokens": 2043554471.0, + "step": 3997 + }, + { + "epoch": 1.0811249323958896, + "grad_norm": 3.171875, + "learning_rate": 0.018413152441858253, + "loss": 3.3414, + "mean_token_accuracy": 0.3724767565727234, + "num_tokens": 2044078570.0, + "step": 3998 + }, + { + "epoch": 1.0813953488372092, + "grad_norm": 3.328125, + "learning_rate": 0.018412258380463266, + "loss": 3.2717, + "mean_token_accuracy": 0.37682196497917175, + "num_tokens": 2044602665.0, + "step": 3999 + }, + { + "epoch": 1.0816657652785289, + "grad_norm": 3.015625, + "learning_rate": 0.01841136409163839, + "loss": 3.2745, + "mean_token_accuracy": 0.3611205220222473, + "num_tokens": 2045126735.0, + "step": 4000 + }, + { + "epoch": 1.0819361817198485, + "grad_norm": 3.296875, + "learning_rate": 0.018410469575411075, + "loss": 3.4539, + "mean_token_accuracy": 0.37266266345977783, + "num_tokens": 2045651018.0, + "step": 4001 + }, + { + "epoch": 1.0822065981611682, + "grad_norm": 3.015625, + "learning_rate": 0.01840957483180876, + "loss": 3.5448, + "mean_token_accuracy": 0.36055994033813477, + "num_tokens": 2046175268.0, + "step": 4002 + }, + { + "epoch": 1.0824770146024878, + "grad_norm": 2.765625, + "learning_rate": 0.018408679860858895, + "loss": 3.2951, + "mean_token_accuracy": 0.3848532438278198, + "num_tokens": 2046699466.0, + "step": 4003 + }, + { + "epoch": 1.0827474310438074, + "grad_norm": 3.59375, + "learning_rate": 0.018407784662588945, + "loss": 3.2421, + "mean_token_accuracy": 0.3994351327419281, + "num_tokens": 2047099068.0, + "step": 4004 + }, + { + "epoch": 1.083017847485127, + "grad_norm": 2.625, + "learning_rate": 0.018406889237026377, + "loss": 3.168, + "mean_token_accuracy": 0.393767774105072, + "num_tokens": 2047623098.0, + "step": 4005 + }, + { + "epoch": 1.0832882639264467, + "grad_norm": 3.265625, + "learning_rate": 0.018405993584198668, + "loss": 3.4596, + "mean_token_accuracy": 0.36106809973716736, + "num_tokens": 2048127919.0, + "step": 4006 + }, + { + "epoch": 1.0835586803677664, + "grad_norm": 3.15625, + "learning_rate": 0.018405097704133292, + "loss": 3.4546, + "mean_token_accuracy": 0.37191224098205566, + "num_tokens": 2048652123.0, + "step": 4007 + }, + { + "epoch": 1.083829096809086, + "grad_norm": 2.984375, + "learning_rate": 0.018404201596857742, + "loss": 3.4881, + "mean_token_accuracy": 0.3673728108406067, + "num_tokens": 2049176344.0, + "step": 4008 + }, + { + "epoch": 1.0840995132504057, + "grad_norm": 2.828125, + "learning_rate": 0.018403305262399514, + "loss": 3.411, + "mean_token_accuracy": 0.3681400716304779, + "num_tokens": 2049700477.0, + "step": 4009 + }, + { + "epoch": 1.0843699296917253, + "grad_norm": 2.875, + "learning_rate": 0.01840240870078611, + "loss": 3.2292, + "mean_token_accuracy": 0.3941749334335327, + "num_tokens": 2050224738.0, + "step": 4010 + }, + { + "epoch": 1.084640346133045, + "grad_norm": 27.875, + "learning_rate": 0.018401511912045036, + "loss": 10.9763, + "mean_token_accuracy": 0.042528968304395676, + "num_tokens": 2050748881.0, + "step": 4011 + }, + { + "epoch": 1.0849107625743646, + "grad_norm": 6.375, + "learning_rate": 0.018400614896203808, + "loss": 3.874, + "mean_token_accuracy": 0.3110674023628235, + "num_tokens": 2051273039.0, + "step": 4012 + }, + { + "epoch": 1.0851811790156842, + "grad_norm": 2.4375, + "learning_rate": 0.018399717653289953, + "loss": 3.4383, + "mean_token_accuracy": 0.36096063256263733, + "num_tokens": 2051797270.0, + "step": 4013 + }, + { + "epoch": 1.0854515954570039, + "grad_norm": 3.171875, + "learning_rate": 0.018398820183331003, + "loss": 3.3492, + "mean_token_accuracy": 0.3587714433670044, + "num_tokens": 2052321405.0, + "step": 4014 + }, + { + "epoch": 1.0857220118983235, + "grad_norm": 3.84375, + "learning_rate": 0.018397922486354485, + "loss": 3.6702, + "mean_token_accuracy": 0.36630892753601074, + "num_tokens": 2052845485.0, + "step": 4015 + }, + { + "epoch": 1.085992428339643, + "grad_norm": 3.421875, + "learning_rate": 0.018397024562387954, + "loss": 3.4478, + "mean_token_accuracy": 0.363938570022583, + "num_tokens": 2053369730.0, + "step": 4016 + }, + { + "epoch": 1.0862628447809626, + "grad_norm": 2.796875, + "learning_rate": 0.018396126411458956, + "loss": 3.2484, + "mean_token_accuracy": 0.3959246277809143, + "num_tokens": 2053859738.0, + "step": 4017 + }, + { + "epoch": 1.0865332612222822, + "grad_norm": 2.234375, + "learning_rate": 0.01839522803359505, + "loss": 3.4656, + "mean_token_accuracy": 0.3642716705799103, + "num_tokens": 2054383956.0, + "step": 4018 + }, + { + "epoch": 1.0868036776636019, + "grad_norm": 2.8125, + "learning_rate": 0.018394329428823797, + "loss": 3.5768, + "mean_token_accuracy": 0.3507019877433777, + "num_tokens": 2054908210.0, + "step": 4019 + }, + { + "epoch": 1.0870740941049215, + "grad_norm": 2.984375, + "learning_rate": 0.018393430597172772, + "loss": 3.3906, + "mean_token_accuracy": 0.34885260462760925, + "num_tokens": 2055432367.0, + "step": 4020 + }, + { + "epoch": 1.0873445105462411, + "grad_norm": 2.828125, + "learning_rate": 0.018392531538669554, + "loss": 3.473, + "mean_token_accuracy": 0.36302459239959717, + "num_tokens": 2055906279.0, + "step": 4021 + }, + { + "epoch": 1.0876149269875608, + "grad_norm": 3.203125, + "learning_rate": 0.018391632253341728, + "loss": 3.368, + "mean_token_accuracy": 0.3805294930934906, + "num_tokens": 2056393742.0, + "step": 4022 + }, + { + "epoch": 1.0878853434288804, + "grad_norm": 3.40625, + "learning_rate": 0.018390732741216886, + "loss": 3.4353, + "mean_token_accuracy": 0.3808525800704956, + "num_tokens": 2056917903.0, + "step": 4023 + }, + { + "epoch": 1.0881557598702, + "grad_norm": 3.171875, + "learning_rate": 0.018389833002322633, + "loss": 3.226, + "mean_token_accuracy": 0.3750949203968048, + "num_tokens": 2057418216.0, + "step": 4024 + }, + { + "epoch": 1.0884261763115197, + "grad_norm": 3.796875, + "learning_rate": 0.01838893303668657, + "loss": 3.5161, + "mean_token_accuracy": 0.368211030960083, + "num_tokens": 2057923913.0, + "step": 4025 + }, + { + "epoch": 1.0886965927528394, + "grad_norm": 24.0, + "learning_rate": 0.01838803284433631, + "loss": 3.6233, + "mean_token_accuracy": 0.3386473059654236, + "num_tokens": 2058448116.0, + "step": 4026 + }, + { + "epoch": 1.088967009194159, + "grad_norm": 4.84375, + "learning_rate": 0.018387132425299477, + "loss": 3.5023, + "mean_token_accuracy": 0.35982024669647217, + "num_tokens": 2058972294.0, + "step": 4027 + }, + { + "epoch": 1.0892374256354787, + "grad_norm": 2.171875, + "learning_rate": 0.018386231779603697, + "loss": 3.2028, + "mean_token_accuracy": 0.3637882471084595, + "num_tokens": 2059484172.0, + "step": 4028 + }, + { + "epoch": 1.0895078420767983, + "grad_norm": 3.96875, + "learning_rate": 0.018385330907276603, + "loss": 3.3126, + "mean_token_accuracy": 0.3611595034599304, + "num_tokens": 2060008304.0, + "step": 4029 + }, + { + "epoch": 1.089778258518118, + "grad_norm": 3.0, + "learning_rate": 0.01838442980834584, + "loss": 3.454, + "mean_token_accuracy": 0.38277533650398254, + "num_tokens": 2060473844.0, + "step": 4030 + }, + { + "epoch": 1.0900486749594376, + "grad_norm": 2.984375, + "learning_rate": 0.018383528482839053, + "loss": 10.8601, + "mean_token_accuracy": 0.0, + "num_tokens": 2060997986.0, + "step": 4031 + }, + { + "epoch": 1.0903190914007572, + "grad_norm": 9.8125, + "learning_rate": 0.0183826269307839, + "loss": 4.3916, + "mean_token_accuracy": 0.2500603497028351, + "num_tokens": 2061522232.0, + "step": 4032 + }, + { + "epoch": 1.0905895078420769, + "grad_norm": 3.171875, + "learning_rate": 0.018381725152208036, + "loss": 3.6484, + "mean_token_accuracy": 0.3236253261566162, + "num_tokens": 2062046482.0, + "step": 4033 + }, + { + "epoch": 1.0908599242833965, + "grad_norm": 2.703125, + "learning_rate": 0.018380823147139142, + "loss": 3.3959, + "mean_token_accuracy": 0.369293212890625, + "num_tokens": 2062570719.0, + "step": 4034 + }, + { + "epoch": 1.0911303407247162, + "grad_norm": 8.4375, + "learning_rate": 0.01837992091560489, + "loss": 3.0805, + "mean_token_accuracy": 0.3707732558250427, + "num_tokens": 2063094802.0, + "step": 4035 + }, + { + "epoch": 1.0914007571660358, + "grad_norm": 2.671875, + "learning_rate": 0.01837901845763296, + "loss": 3.3882, + "mean_token_accuracy": 0.3374286890029907, + "num_tokens": 2063560773.0, + "step": 4036 + }, + { + "epoch": 1.0916711736073554, + "grad_norm": 18.5, + "learning_rate": 0.01837811577325104, + "loss": 3.5874, + "mean_token_accuracy": 0.3384869396686554, + "num_tokens": 2064085027.0, + "step": 4037 + }, + { + "epoch": 1.0919415900486749, + "grad_norm": 4.21875, + "learning_rate": 0.018377212862486832, + "loss": 3.4791, + "mean_token_accuracy": 0.358338326215744, + "num_tokens": 2064549404.0, + "step": 4038 + }, + { + "epoch": 1.0922120064899945, + "grad_norm": 2.1875, + "learning_rate": 0.018376309725368037, + "loss": 3.5352, + "mean_token_accuracy": 0.34293651580810547, + "num_tokens": 2065073533.0, + "step": 4039 + }, + { + "epoch": 1.0924824229313141, + "grad_norm": 3.796875, + "learning_rate": 0.018375406361922367, + "loss": 3.3779, + "mean_token_accuracy": 0.37433966994285583, + "num_tokens": 2065597638.0, + "step": 4040 + }, + { + "epoch": 1.0927528393726338, + "grad_norm": 2.828125, + "learning_rate": 0.018374502772177545, + "loss": 3.3296, + "mean_token_accuracy": 0.35760554671287537, + "num_tokens": 2066121797.0, + "step": 4041 + }, + { + "epoch": 1.0930232558139534, + "grad_norm": 2.71875, + "learning_rate": 0.01837359895616129, + "loss": 3.331, + "mean_token_accuracy": 0.37094932794570923, + "num_tokens": 2066646024.0, + "step": 4042 + }, + { + "epoch": 1.093293672255273, + "grad_norm": 3.171875, + "learning_rate": 0.018372694913901335, + "loss": 3.6615, + "mean_token_accuracy": 0.3530362546443939, + "num_tokens": 2067170264.0, + "step": 4043 + }, + { + "epoch": 1.0935640886965927, + "grad_norm": 3.125, + "learning_rate": 0.018371790645425413, + "loss": 3.4704, + "mean_token_accuracy": 0.3593124747276306, + "num_tokens": 2067694472.0, + "step": 4044 + }, + { + "epoch": 1.0938345051379124, + "grad_norm": 37.5, + "learning_rate": 0.018370886150761278, + "loss": 3.5238, + "mean_token_accuracy": 0.327823281288147, + "num_tokens": 2068218490.0, + "step": 4045 + }, + { + "epoch": 1.094104921579232, + "grad_norm": 4.8125, + "learning_rate": 0.01836998142993668, + "loss": 3.3971, + "mean_token_accuracy": 0.35719895362854004, + "num_tokens": 2068742623.0, + "step": 4046 + }, + { + "epoch": 1.0943753380205516, + "grad_norm": 1.65625, + "learning_rate": 0.01836907648297938, + "loss": 3.575, + "mean_token_accuracy": 0.35914021730422974, + "num_tokens": 2069266826.0, + "step": 4047 + }, + { + "epoch": 1.0946457544618713, + "grad_norm": 2.796875, + "learning_rate": 0.01836817130991714, + "loss": 3.4594, + "mean_token_accuracy": 0.36605751514434814, + "num_tokens": 2069749467.0, + "step": 4048 + }, + { + "epoch": 1.094916170903191, + "grad_norm": 2.78125, + "learning_rate": 0.018367265910777735, + "loss": 3.51, + "mean_token_accuracy": 0.3631952702999115, + "num_tokens": 2070273558.0, + "step": 4049 + }, + { + "epoch": 1.0951865873445106, + "grad_norm": 3.578125, + "learning_rate": 0.018366360285588947, + "loss": 3.3453, + "mean_token_accuracy": 0.36916041374206543, + "num_tokens": 2070797774.0, + "step": 4050 + }, + { + "epoch": 1.0954570037858302, + "grad_norm": 5.78125, + "learning_rate": 0.01836545443437856, + "loss": 9.4794, + "mean_token_accuracy": 0.03412650525569916, + "num_tokens": 2071257853.0, + "step": 4051 + }, + { + "epoch": 1.0957274202271499, + "grad_norm": 7.6875, + "learning_rate": 0.018364548357174375, + "loss": 3.8298, + "mean_token_accuracy": 0.36780309677124023, + "num_tokens": 2071720491.0, + "step": 4052 + }, + { + "epoch": 1.0959978366684695, + "grad_norm": 3.125, + "learning_rate": 0.018363642054004186, + "loss": 3.5528, + "mean_token_accuracy": 0.356937050819397, + "num_tokens": 2072244623.0, + "step": 4053 + }, + { + "epoch": 1.0962682531097891, + "grad_norm": 4.34375, + "learning_rate": 0.018362735524895805, + "loss": 3.3471, + "mean_token_accuracy": 0.3643946647644043, + "num_tokens": 2072768815.0, + "step": 4054 + }, + { + "epoch": 1.0965386695511088, + "grad_norm": 2.625, + "learning_rate": 0.018361828769877048, + "loss": 3.464, + "mean_token_accuracy": 0.3684970736503601, + "num_tokens": 2073235189.0, + "step": 4055 + }, + { + "epoch": 1.0968090859924284, + "grad_norm": 2.6875, + "learning_rate": 0.018360921788975727, + "loss": 3.4534, + "mean_token_accuracy": 0.38261228799819946, + "num_tokens": 2073759469.0, + "step": 4056 + }, + { + "epoch": 1.0970795024337479, + "grad_norm": 3.28125, + "learning_rate": 0.01836001458221969, + "loss": 3.43, + "mean_token_accuracy": 0.3827243149280548, + "num_tokens": 2074244309.0, + "step": 4057 + }, + { + "epoch": 1.0973499188750675, + "grad_norm": 3.4375, + "learning_rate": 0.01835910714963675, + "loss": 3.461, + "mean_token_accuracy": 0.3717348575592041, + "num_tokens": 2074703543.0, + "step": 4058 + }, + { + "epoch": 1.0976203353163871, + "grad_norm": 3.171875, + "learning_rate": 0.018358199491254765, + "loss": 3.1314, + "mean_token_accuracy": 0.4020949602127075, + "num_tokens": 2075185522.0, + "step": 4059 + }, + { + "epoch": 1.0978907517577068, + "grad_norm": 2.921875, + "learning_rate": 0.018357291607101582, + "loss": 3.4788, + "mean_token_accuracy": 0.36358311772346497, + "num_tokens": 2075709692.0, + "step": 4060 + }, + { + "epoch": 1.0981611681990264, + "grad_norm": 2.78125, + "learning_rate": 0.018356383497205055, + "loss": 3.2984, + "mean_token_accuracy": 0.39186155796051025, + "num_tokens": 2076175854.0, + "step": 4061 + }, + { + "epoch": 1.098431584640346, + "grad_norm": 2.8125, + "learning_rate": 0.01835547516159305, + "loss": 3.4345, + "mean_token_accuracy": 0.36702507734298706, + "num_tokens": 2076700049.0, + "step": 4062 + }, + { + "epoch": 1.0987020010816657, + "grad_norm": 2.53125, + "learning_rate": 0.018354566600293436, + "loss": 3.3605, + "mean_token_accuracy": 0.38285017013549805, + "num_tokens": 2077224318.0, + "step": 4063 + }, + { + "epoch": 1.0989724175229854, + "grad_norm": 3.15625, + "learning_rate": 0.018353657813334086, + "loss": 3.1381, + "mean_token_accuracy": 0.39505448937416077, + "num_tokens": 2077691154.0, + "step": 4064 + }, + { + "epoch": 1.099242833964305, + "grad_norm": 2.53125, + "learning_rate": 0.018352748800742894, + "loss": 3.4334, + "mean_token_accuracy": 0.3744688034057617, + "num_tokens": 2078208091.0, + "step": 4065 + }, + { + "epoch": 1.0995132504056246, + "grad_norm": 2.515625, + "learning_rate": 0.01835183956254774, + "loss": 3.2383, + "mean_token_accuracy": 0.3807169198989868, + "num_tokens": 2078719013.0, + "step": 4066 + }, + { + "epoch": 1.0997836668469443, + "grad_norm": 3.109375, + "learning_rate": 0.018350930098776534, + "loss": 3.5171, + "mean_token_accuracy": 0.35915684700012207, + "num_tokens": 2079243168.0, + "step": 4067 + }, + { + "epoch": 1.100054083288264, + "grad_norm": 2.46875, + "learning_rate": 0.01835002040945717, + "loss": 3.1004, + "mean_token_accuracy": 0.3943101763725281, + "num_tokens": 2079757453.0, + "step": 4068 + }, + { + "epoch": 1.1003244997295836, + "grad_norm": 2.890625, + "learning_rate": 0.018349110494617567, + "loss": 3.5211, + "mean_token_accuracy": 0.3559786379337311, + "num_tokens": 2080281657.0, + "step": 4069 + }, + { + "epoch": 1.1005949161709032, + "grad_norm": 3.25, + "learning_rate": 0.01834820035428564, + "loss": 3.2162, + "mean_token_accuracy": 0.3746817708015442, + "num_tokens": 2080805904.0, + "step": 4070 + }, + { + "epoch": 1.1008653326122229, + "grad_norm": 2.71875, + "learning_rate": 0.01834728998848932, + "loss": 11.0812, + "mean_token_accuracy": 3.1059757930052e-06, + "num_tokens": 2081330124.0, + "step": 4071 + }, + { + "epoch": 1.1011357490535425, + "grad_norm": 5.375, + "learning_rate": 0.018346379397256536, + "loss": 3.8107, + "mean_token_accuracy": 0.32510799169540405, + "num_tokens": 2081854396.0, + "step": 4072 + }, + { + "epoch": 1.1014061654948621, + "grad_norm": 3.203125, + "learning_rate": 0.018345468580615226, + "loss": 3.4741, + "mean_token_accuracy": 0.3651556968688965, + "num_tokens": 2082378634.0, + "step": 4073 + }, + { + "epoch": 1.1016765819361818, + "grad_norm": 4.21875, + "learning_rate": 0.018344557538593338, + "loss": 3.5186, + "mean_token_accuracy": 0.37581586837768555, + "num_tokens": 2082844742.0, + "step": 4074 + }, + { + "epoch": 1.1019469983775014, + "grad_norm": 2.03125, + "learning_rate": 0.018343646271218827, + "loss": 3.3571, + "mean_token_accuracy": 0.3690100312232971, + "num_tokens": 2083297432.0, + "step": 4075 + }, + { + "epoch": 1.102217414818821, + "grad_norm": 2.75, + "learning_rate": 0.01834273477851965, + "loss": 3.2971, + "mean_token_accuracy": 0.37948623299598694, + "num_tokens": 2083821616.0, + "step": 4076 + }, + { + "epoch": 1.1024878312601407, + "grad_norm": 3.484375, + "learning_rate": 0.01834182306052378, + "loss": 3.2331, + "mean_token_accuracy": 0.3798114061355591, + "num_tokens": 2084345771.0, + "step": 4077 + }, + { + "epoch": 1.1027582477014604, + "grad_norm": 3.28125, + "learning_rate": 0.018340911117259186, + "loss": 3.3564, + "mean_token_accuracy": 0.39158397912979126, + "num_tokens": 2084771102.0, + "step": 4078 + }, + { + "epoch": 1.1030286641427798, + "grad_norm": 3.421875, + "learning_rate": 0.01833999894875385, + "loss": 3.4417, + "mean_token_accuracy": 0.37044399976730347, + "num_tokens": 2085265995.0, + "step": 4079 + }, + { + "epoch": 1.1032990805840994, + "grad_norm": 2.859375, + "learning_rate": 0.018339086555035758, + "loss": 3.3576, + "mean_token_accuracy": 0.37203875184059143, + "num_tokens": 2085790195.0, + "step": 4080 + }, + { + "epoch": 1.103569497025419, + "grad_norm": 3.234375, + "learning_rate": 0.01833817393613291, + "loss": 3.4079, + "mean_token_accuracy": 0.35863256454467773, + "num_tokens": 2086314384.0, + "step": 4081 + }, + { + "epoch": 1.1038399134667387, + "grad_norm": 6.8125, + "learning_rate": 0.018337261092073308, + "loss": 3.4854, + "mean_token_accuracy": 0.3744860887527466, + "num_tokens": 2086838513.0, + "step": 4082 + }, + { + "epoch": 1.1041103299080584, + "grad_norm": 1.9453125, + "learning_rate": 0.018336348022884957, + "loss": 3.5768, + "mean_token_accuracy": 0.36536261439323425, + "num_tokens": 2087362762.0, + "step": 4083 + }, + { + "epoch": 1.104380746349378, + "grad_norm": 2.71875, + "learning_rate": 0.018335434728595872, + "loss": 3.0003, + "mean_token_accuracy": 0.39088448882102966, + "num_tokens": 2087839160.0, + "step": 4084 + }, + { + "epoch": 1.1046511627906976, + "grad_norm": 3.0, + "learning_rate": 0.018334521209234077, + "loss": 3.4629, + "mean_token_accuracy": 0.3629280626773834, + "num_tokens": 2088363275.0, + "step": 4085 + }, + { + "epoch": 1.1049215792320173, + "grad_norm": 3.515625, + "learning_rate": 0.018333607464827605, + "loss": 3.5125, + "mean_token_accuracy": 0.37546810507774353, + "num_tokens": 2088887558.0, + "step": 4086 + }, + { + "epoch": 1.105191995673337, + "grad_norm": 4.15625, + "learning_rate": 0.018332693495404485, + "loss": 3.6844, + "mean_token_accuracy": 0.3494536280632019, + "num_tokens": 2089348515.0, + "step": 4087 + }, + { + "epoch": 1.1054624121146566, + "grad_norm": 5.84375, + "learning_rate": 0.018331779300992772, + "loss": 2.8737, + "mean_token_accuracy": 0.43180567026138306, + "num_tokens": 2089817832.0, + "step": 4088 + }, + { + "epoch": 1.1057328285559762, + "grad_norm": 3.15625, + "learning_rate": 0.0183308648816205, + "loss": 3.2944, + "mean_token_accuracy": 0.3944166898727417, + "num_tokens": 2090295376.0, + "step": 4089 + }, + { + "epoch": 1.1060032449972959, + "grad_norm": 3.796875, + "learning_rate": 0.01832995023731574, + "loss": 3.5127, + "mean_token_accuracy": 0.3686339855194092, + "num_tokens": 2090819659.0, + "step": 4090 + }, + { + "epoch": 1.1062736614386155, + "grad_norm": 12.25, + "learning_rate": 0.01832903536810655, + "loss": 9.095, + "mean_token_accuracy": 0.03671751916408539, + "num_tokens": 2091343755.0, + "step": 4091 + }, + { + "epoch": 1.1065440778799351, + "grad_norm": 5.75, + "learning_rate": 0.018328120274021, + "loss": 3.8092, + "mean_token_accuracy": 0.3453200161457062, + "num_tokens": 2091868037.0, + "step": 4092 + }, + { + "epoch": 1.1068144943212548, + "grad_norm": 2.5, + "learning_rate": 0.018327204955087172, + "loss": 3.5638, + "mean_token_accuracy": 0.3502558469772339, + "num_tokens": 2092392205.0, + "step": 4093 + }, + { + "epoch": 1.1070849107625744, + "grad_norm": 4.21875, + "learning_rate": 0.018326289411333147, + "loss": 3.6749, + "mean_token_accuracy": 0.31084662675857544, + "num_tokens": 2092916473.0, + "step": 4094 + }, + { + "epoch": 1.107355327203894, + "grad_norm": 3.515625, + "learning_rate": 0.018325373642787017, + "loss": 3.4536, + "mean_token_accuracy": 0.36055421829223633, + "num_tokens": 2093440577.0, + "step": 4095 + }, + { + "epoch": 1.1076257436452137, + "grad_norm": 2.375, + "learning_rate": 0.018324457649476877, + "loss": 3.3259, + "mean_token_accuracy": 0.3976368010044098, + "num_tokens": 2093924796.0, + "step": 4096 + }, + { + "epoch": 1.1078961600865334, + "grad_norm": 3.09375, + "learning_rate": 0.018323541431430845, + "loss": 3.2005, + "mean_token_accuracy": 0.39068153500556946, + "num_tokens": 2094449073.0, + "step": 4097 + }, + { + "epoch": 1.1081665765278528, + "grad_norm": 2.640625, + "learning_rate": 0.018322624988677018, + "loss": 3.1995, + "mean_token_accuracy": 0.35122427344322205, + "num_tokens": 2094973307.0, + "step": 4098 + }, + { + "epoch": 1.1084369929691724, + "grad_norm": 3.171875, + "learning_rate": 0.018321708321243524, + "loss": 3.3868, + "mean_token_accuracy": 0.36918359994888306, + "num_tokens": 2095497486.0, + "step": 4099 + }, + { + "epoch": 1.108707409410492, + "grad_norm": 3.578125, + "learning_rate": 0.018320791429158486, + "loss": 3.4836, + "mean_token_accuracy": 0.3712773323059082, + "num_tokens": 2096021671.0, + "step": 4100 + }, + { + "epoch": 1.1089778258518117, + "grad_norm": 3.28125, + "learning_rate": 0.01831987431245004, + "loss": 3.4889, + "mean_token_accuracy": 0.35557782649993896, + "num_tokens": 2096540174.0, + "step": 4101 + }, + { + "epoch": 1.1092482422931313, + "grad_norm": 2.5625, + "learning_rate": 0.018318956971146324, + "loss": 3.4948, + "mean_token_accuracy": 0.367106169462204, + "num_tokens": 2097064420.0, + "step": 4102 + }, + { + "epoch": 1.109518658734451, + "grad_norm": 3.171875, + "learning_rate": 0.018318039405275485, + "loss": 3.348, + "mean_token_accuracy": 0.37243223190307617, + "num_tokens": 2097536701.0, + "step": 4103 + }, + { + "epoch": 1.1097890751757706, + "grad_norm": 3.046875, + "learning_rate": 0.018317121614865674, + "loss": 3.299, + "mean_token_accuracy": 0.3860001564025879, + "num_tokens": 2098026553.0, + "step": 4104 + }, + { + "epoch": 1.1100594916170903, + "grad_norm": 2.734375, + "learning_rate": 0.01831620359994505, + "loss": 3.3833, + "mean_token_accuracy": 0.3678569793701172, + "num_tokens": 2098550776.0, + "step": 4105 + }, + { + "epoch": 1.11032990805841, + "grad_norm": 2.234375, + "learning_rate": 0.018315285360541795, + "loss": 3.2283, + "mean_token_accuracy": 0.3967941403388977, + "num_tokens": 2099075060.0, + "step": 4106 + }, + { + "epoch": 1.1106003244997296, + "grad_norm": 3.140625, + "learning_rate": 0.01831436689668406, + "loss": 3.3632, + "mean_token_accuracy": 0.33288103342056274, + "num_tokens": 2099599170.0, + "step": 4107 + }, + { + "epoch": 1.1108707409410492, + "grad_norm": 2.3125, + "learning_rate": 0.018313448208400047, + "loss": 3.1696, + "mean_token_accuracy": 0.3851485848426819, + "num_tokens": 2100123235.0, + "step": 4108 + }, + { + "epoch": 1.1111411573823688, + "grad_norm": 3.03125, + "learning_rate": 0.018312529295717934, + "loss": 3.49, + "mean_token_accuracy": 0.3692573606967926, + "num_tokens": 2100574324.0, + "step": 4109 + }, + { + "epoch": 1.1114115738236885, + "grad_norm": 2.765625, + "learning_rate": 0.018311610158665913, + "loss": 3.4266, + "mean_token_accuracy": 0.3665526211261749, + "num_tokens": 2101098528.0, + "step": 4110 + }, + { + "epoch": 1.1116819902650081, + "grad_norm": 2.4375, + "learning_rate": 0.018310690797272194, + "loss": 11.1839, + "mean_token_accuracy": 7.188449671957642e-05, + "num_tokens": 2101622727.0, + "step": 4111 + }, + { + "epoch": 1.1119524067063278, + "grad_norm": 6.46875, + "learning_rate": 0.01830977121156498, + "loss": 3.9845, + "mean_token_accuracy": 0.2942502498626709, + "num_tokens": 2102146929.0, + "step": 4112 + }, + { + "epoch": 1.1122228231476474, + "grad_norm": 2.90625, + "learning_rate": 0.01830885140157249, + "loss": 3.6507, + "mean_token_accuracy": 0.3422532379627228, + "num_tokens": 2102630186.0, + "step": 4113 + }, + { + "epoch": 1.112493239588967, + "grad_norm": 3.359375, + "learning_rate": 0.018307931367322946, + "loss": 3.4386, + "mean_token_accuracy": 0.3758065104484558, + "num_tokens": 2103154207.0, + "step": 4114 + }, + { + "epoch": 1.1127636560302867, + "grad_norm": 2.6875, + "learning_rate": 0.018307011108844574, + "loss": 3.5332, + "mean_token_accuracy": 0.34567689895629883, + "num_tokens": 2103678393.0, + "step": 4115 + }, + { + "epoch": 1.1130340724716064, + "grad_norm": 2.984375, + "learning_rate": 0.018306090626165613, + "loss": 3.2857, + "mean_token_accuracy": 0.35270529985427856, + "num_tokens": 2104202605.0, + "step": 4116 + }, + { + "epoch": 1.113304488912926, + "grad_norm": 3.15625, + "learning_rate": 0.01830516991931431, + "loss": 3.3779, + "mean_token_accuracy": 0.3786579370498657, + "num_tokens": 2104726725.0, + "step": 4117 + }, + { + "epoch": 1.1135749053542456, + "grad_norm": 2.65625, + "learning_rate": 0.018304248988318905, + "loss": 3.5662, + "mean_token_accuracy": 0.35397619009017944, + "num_tokens": 2105250865.0, + "step": 4118 + }, + { + "epoch": 1.1138453217955653, + "grad_norm": 2.359375, + "learning_rate": 0.018303327833207664, + "loss": 3.1963, + "mean_token_accuracy": 0.37757980823516846, + "num_tokens": 2105775017.0, + "step": 4119 + }, + { + "epoch": 1.1141157382368847, + "grad_norm": 3.1875, + "learning_rate": 0.018302406454008848, + "loss": 3.329, + "mean_token_accuracy": 0.37225228548049927, + "num_tokens": 2106299170.0, + "step": 4120 + }, + { + "epoch": 1.1143861546782043, + "grad_norm": 3.390625, + "learning_rate": 0.018301484850750723, + "loss": 3.2475, + "mean_token_accuracy": 0.3687160015106201, + "num_tokens": 2106809365.0, + "step": 4121 + }, + { + "epoch": 1.114656571119524, + "grad_norm": 2.5, + "learning_rate": 0.018300563023461575, + "loss": 3.4137, + "mean_token_accuracy": 0.39447522163391113, + "num_tokens": 2107273948.0, + "step": 4122 + }, + { + "epoch": 1.1149269875608436, + "grad_norm": 3.65625, + "learning_rate": 0.01829964097216968, + "loss": 3.487, + "mean_token_accuracy": 0.34787559509277344, + "num_tokens": 2107798161.0, + "step": 4123 + }, + { + "epoch": 1.1151974040021633, + "grad_norm": 3.1875, + "learning_rate": 0.018298718696903336, + "loss": 3.3267, + "mean_token_accuracy": 0.39945563673973083, + "num_tokens": 2108322377.0, + "step": 4124 + }, + { + "epoch": 1.115467820443483, + "grad_norm": 26.0, + "learning_rate": 0.018297796197690838, + "loss": 3.4879, + "mean_token_accuracy": 0.34706059098243713, + "num_tokens": 2108846640.0, + "step": 4125 + }, + { + "epoch": 1.1157382368848026, + "grad_norm": 4.0625, + "learning_rate": 0.01829687347456049, + "loss": 3.6261, + "mean_token_accuracy": 0.3522854447364807, + "num_tokens": 2109370837.0, + "step": 4126 + }, + { + "epoch": 1.1160086533261222, + "grad_norm": 2.125, + "learning_rate": 0.018295950527540603, + "loss": 3.1265, + "mean_token_accuracy": 0.40572649240493774, + "num_tokens": 2109870447.0, + "step": 4127 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 3.078125, + "learning_rate": 0.018295027356659502, + "loss": 3.621, + "mean_token_accuracy": 0.3502650260925293, + "num_tokens": 2110394682.0, + "step": 4128 + }, + { + "epoch": 1.1165494862087615, + "grad_norm": 2.53125, + "learning_rate": 0.01829410396194551, + "loss": 3.2857, + "mean_token_accuracy": 0.36172062158584595, + "num_tokens": 2110918774.0, + "step": 4129 + }, + { + "epoch": 1.1168199026500811, + "grad_norm": 2.9375, + "learning_rate": 0.01829318034342695, + "loss": 3.4499, + "mean_token_accuracy": 0.35972321033477783, + "num_tokens": 2111394089.0, + "step": 4130 + }, + { + "epoch": 1.1170903190914008, + "grad_norm": 6.90625, + "learning_rate": 0.018292256501132177, + "loss": 8.8614, + "mean_token_accuracy": 0.024482663720846176, + "num_tokens": 2111918356.0, + "step": 4131 + }, + { + "epoch": 1.1173607355327204, + "grad_norm": 6.625, + "learning_rate": 0.018291332435089524, + "loss": 4.1153, + "mean_token_accuracy": 0.298650860786438, + "num_tokens": 2112442631.0, + "step": 4132 + }, + { + "epoch": 1.11763115197404, + "grad_norm": 2.328125, + "learning_rate": 0.018290408145327352, + "loss": 3.5395, + "mean_token_accuracy": 0.3563840687274933, + "num_tokens": 2112966898.0, + "step": 4133 + }, + { + "epoch": 1.1179015684153597, + "grad_norm": 5.34375, + "learning_rate": 0.01828948363187402, + "loss": 3.5037, + "mean_token_accuracy": 0.38650062680244446, + "num_tokens": 2113320809.0, + "step": 4134 + }, + { + "epoch": 1.1181719848566793, + "grad_norm": 2.671875, + "learning_rate": 0.01828855889475789, + "loss": 3.4583, + "mean_token_accuracy": 0.3683500289916992, + "num_tokens": 2113784525.0, + "step": 4135 + }, + { + "epoch": 1.118442401297999, + "grad_norm": 3.703125, + "learning_rate": 0.01828763393400734, + "loss": 3.5867, + "mean_token_accuracy": 0.34693413972854614, + "num_tokens": 2114308717.0, + "step": 4136 + }, + { + "epoch": 1.1187128177393186, + "grad_norm": 3.40625, + "learning_rate": 0.01828670874965075, + "loss": 3.4723, + "mean_token_accuracy": 0.3791412115097046, + "num_tokens": 2114832862.0, + "step": 4137 + }, + { + "epoch": 1.1189832341806383, + "grad_norm": 3.15625, + "learning_rate": 0.018285783341716506, + "loss": 3.5634, + "mean_token_accuracy": 0.3573034405708313, + "num_tokens": 2115357000.0, + "step": 4138 + }, + { + "epoch": 1.119253650621958, + "grad_norm": 2.625, + "learning_rate": 0.018284857710233002, + "loss": 3.4254, + "mean_token_accuracy": 0.37361276149749756, + "num_tokens": 2115881228.0, + "step": 4139 + }, + { + "epoch": 1.1195240670632773, + "grad_norm": 41.25, + "learning_rate": 0.018283931855228645, + "loss": 3.4916, + "mean_token_accuracy": 0.3397008776664734, + "num_tokens": 2116405408.0, + "step": 4140 + }, + { + "epoch": 1.119794483504597, + "grad_norm": 4.9375, + "learning_rate": 0.018283005776731833, + "loss": 3.6103, + "mean_token_accuracy": 0.34742510318756104, + "num_tokens": 2116906983.0, + "step": 4141 + }, + { + "epoch": 1.1200648999459166, + "grad_norm": 2.0625, + "learning_rate": 0.018282079474770986, + "loss": 3.4322, + "mean_token_accuracy": 0.35398170351982117, + "num_tokens": 2117431258.0, + "step": 4142 + }, + { + "epoch": 1.1203353163872363, + "grad_norm": 4.0, + "learning_rate": 0.018281152949374525, + "loss": 3.3401, + "mean_token_accuracy": 0.3844928443431854, + "num_tokens": 2117894995.0, + "step": 4143 + }, + { + "epoch": 1.120605732828556, + "grad_norm": 3.46875, + "learning_rate": 0.018280226200570883, + "loss": 3.4915, + "mean_token_accuracy": 0.35656750202178955, + "num_tokens": 2118419224.0, + "step": 4144 + }, + { + "epoch": 1.1208761492698756, + "grad_norm": 9.4375, + "learning_rate": 0.01827929922838849, + "loss": 3.4083, + "mean_token_accuracy": 0.4052891135215759, + "num_tokens": 2118918827.0, + "step": 4145 + }, + { + "epoch": 1.1211465657111952, + "grad_norm": 2.25, + "learning_rate": 0.01827837203285579, + "loss": 3.5144, + "mean_token_accuracy": 0.3363327383995056, + "num_tokens": 2119443079.0, + "step": 4146 + }, + { + "epoch": 1.1214169821525148, + "grad_norm": 4.21875, + "learning_rate": 0.01827744461400123, + "loss": 3.6385, + "mean_token_accuracy": 0.3505948781967163, + "num_tokens": 2119967278.0, + "step": 4147 + }, + { + "epoch": 1.1216873985938345, + "grad_norm": 2.53125, + "learning_rate": 0.018276516971853268, + "loss": 3.4323, + "mean_token_accuracy": 0.3857431411743164, + "num_tokens": 2120416829.0, + "step": 4148 + }, + { + "epoch": 1.1219578150351541, + "grad_norm": 3.15625, + "learning_rate": 0.018275589106440365, + "loss": 3.4247, + "mean_token_accuracy": 0.4088192582130432, + "num_tokens": 2120877879.0, + "step": 4149 + }, + { + "epoch": 1.1222282314764738, + "grad_norm": 3.0, + "learning_rate": 0.018274661017790995, + "loss": 3.2917, + "mean_token_accuracy": 0.3716355562210083, + "num_tokens": 2121402057.0, + "step": 4150 + }, + { + "epoch": 1.1224986479177934, + "grad_norm": 2.28125, + "learning_rate": 0.018273732705933632, + "loss": 11.2877, + "mean_token_accuracy": 0.0, + "num_tokens": 2121926335.0, + "step": 4151 + }, + { + "epoch": 1.122769064359113, + "grad_norm": 6.0625, + "learning_rate": 0.018272804170896755, + "loss": 3.8311, + "mean_token_accuracy": 0.3076149821281433, + "num_tokens": 2122413721.0, + "step": 4152 + }, + { + "epoch": 1.1230394808004327, + "grad_norm": 1.9453125, + "learning_rate": 0.01827187541270886, + "loss": 3.4528, + "mean_token_accuracy": 0.3830110430717468, + "num_tokens": 2122937903.0, + "step": 4153 + }, + { + "epoch": 1.1233098972417523, + "grad_norm": 2.40625, + "learning_rate": 0.018270946431398444, + "loss": 3.3806, + "mean_token_accuracy": 0.3528555929660797, + "num_tokens": 2123462128.0, + "step": 4154 + }, + { + "epoch": 1.123580313683072, + "grad_norm": 2.765625, + "learning_rate": 0.01827001722699401, + "loss": 3.4482, + "mean_token_accuracy": 0.3884667754173279, + "num_tokens": 2123936375.0, + "step": 4155 + }, + { + "epoch": 1.1238507301243916, + "grad_norm": 3.375, + "learning_rate": 0.01826908779952406, + "loss": 3.5434, + "mean_token_accuracy": 0.36312800645828247, + "num_tokens": 2124455097.0, + "step": 4156 + }, + { + "epoch": 1.1241211465657113, + "grad_norm": 12.0625, + "learning_rate": 0.018268158149017128, + "loss": 2.9746, + "mean_token_accuracy": 0.39220669865608215, + "num_tokens": 2124979139.0, + "step": 4157 + }, + { + "epoch": 1.124391563007031, + "grad_norm": 2.671875, + "learning_rate": 0.01826722827550173, + "loss": 3.3277, + "mean_token_accuracy": 0.37648066878318787, + "num_tokens": 2125503314.0, + "step": 4158 + }, + { + "epoch": 1.1246619794483506, + "grad_norm": 3.5, + "learning_rate": 0.01826629817900639, + "loss": 3.1583, + "mean_token_accuracy": 0.386184960603714, + "num_tokens": 2126021157.0, + "step": 4159 + }, + { + "epoch": 1.1249323958896702, + "grad_norm": 3.703125, + "learning_rate": 0.018265367859559657, + "loss": 3.4748, + "mean_token_accuracy": 0.34651893377304077, + "num_tokens": 2126545336.0, + "step": 4160 + }, + { + "epoch": 1.1252028123309898, + "grad_norm": 2.609375, + "learning_rate": 0.01826443731719007, + "loss": 3.2941, + "mean_token_accuracy": 0.39904361963272095, + "num_tokens": 2127010341.0, + "step": 4161 + }, + { + "epoch": 1.1254732287723093, + "grad_norm": 3.09375, + "learning_rate": 0.018263506551926184, + "loss": 3.3273, + "mean_token_accuracy": 0.37097758054733276, + "num_tokens": 2127512667.0, + "step": 4162 + }, + { + "epoch": 1.125743645213629, + "grad_norm": 3.0625, + "learning_rate": 0.018262575563796558, + "loss": 3.3509, + "mean_token_accuracy": 0.3696843981742859, + "num_tokens": 2128036840.0, + "step": 4163 + }, + { + "epoch": 1.1260140616549486, + "grad_norm": 3.453125, + "learning_rate": 0.018261644352829755, + "loss": 3.4436, + "mean_token_accuracy": 0.3698648512363434, + "num_tokens": 2128560978.0, + "step": 4164 + }, + { + "epoch": 1.1262844780962682, + "grad_norm": 3.484375, + "learning_rate": 0.01826071291905435, + "loss": 3.1783, + "mean_token_accuracy": 0.3764200806617737, + "num_tokens": 2129073113.0, + "step": 4165 + }, + { + "epoch": 1.1265548945375878, + "grad_norm": 2.9375, + "learning_rate": 0.018259781262498914, + "loss": 3.5266, + "mean_token_accuracy": 0.35327327251434326, + "num_tokens": 2129597378.0, + "step": 4166 + }, + { + "epoch": 1.1268253109789075, + "grad_norm": 3.359375, + "learning_rate": 0.018258849383192045, + "loss": 3.1613, + "mean_token_accuracy": 0.3856881260871887, + "num_tokens": 2130121412.0, + "step": 4167 + }, + { + "epoch": 1.1270957274202271, + "grad_norm": 2.78125, + "learning_rate": 0.018257917281162328, + "loss": 3.3335, + "mean_token_accuracy": 0.37145811319351196, + "num_tokens": 2130645598.0, + "step": 4168 + }, + { + "epoch": 1.1273661438615468, + "grad_norm": 3.203125, + "learning_rate": 0.018256984956438366, + "loss": 3.3155, + "mean_token_accuracy": 0.3915233612060547, + "num_tokens": 2131135309.0, + "step": 4169 + }, + { + "epoch": 1.1276365603028664, + "grad_norm": 2.296875, + "learning_rate": 0.018256052409048762, + "loss": 3.3564, + "mean_token_accuracy": 0.38689690828323364, + "num_tokens": 2131659489.0, + "step": 4170 + }, + { + "epoch": 1.127906976744186, + "grad_norm": 0.478515625, + "learning_rate": 0.018255119639022134, + "loss": 8.6072, + "mean_token_accuracy": 0.03983915597200394, + "num_tokens": 2132183684.0, + "step": 4171 + }, + { + "epoch": 1.1281773931855057, + "grad_norm": 8.6875, + "learning_rate": 0.018254186646387095, + "loss": 4.0044, + "mean_token_accuracy": 0.29808855056762695, + "num_tokens": 2132707860.0, + "step": 4172 + }, + { + "epoch": 1.1284478096268253, + "grad_norm": 3.984375, + "learning_rate": 0.018253253431172282, + "loss": 3.6485, + "mean_token_accuracy": 0.3970223069190979, + "num_tokens": 2133171088.0, + "step": 4173 + }, + { + "epoch": 1.128718226068145, + "grad_norm": 3.46875, + "learning_rate": 0.018252319993406322, + "loss": 3.4, + "mean_token_accuracy": 0.3705807328224182, + "num_tokens": 2133695249.0, + "step": 4174 + }, + { + "epoch": 1.1289886425094646, + "grad_norm": 3.25, + "learning_rate": 0.018251386333117854, + "loss": 3.5277, + "mean_token_accuracy": 0.36385178565979004, + "num_tokens": 2134200133.0, + "step": 4175 + }, + { + "epoch": 1.1292590589507843, + "grad_norm": 3.3125, + "learning_rate": 0.018250452450335533, + "loss": 3.4088, + "mean_token_accuracy": 0.3853432238101959, + "num_tokens": 2134672534.0, + "step": 4176 + }, + { + "epoch": 1.129529475392104, + "grad_norm": 3.703125, + "learning_rate": 0.018249518345088003, + "loss": 3.466, + "mean_token_accuracy": 0.34949401021003723, + "num_tokens": 2135196668.0, + "step": 4177 + }, + { + "epoch": 1.1297998918334236, + "grad_norm": 3.90625, + "learning_rate": 0.018248584017403934, + "loss": 3.8182, + "mean_token_accuracy": 0.3546130657196045, + "num_tokens": 2135720847.0, + "step": 4178 + }, + { + "epoch": 1.1300703082747432, + "grad_norm": 3.453125, + "learning_rate": 0.01824764946731199, + "loss": 3.4328, + "mean_token_accuracy": 0.3642567992210388, + "num_tokens": 2136202908.0, + "step": 4179 + }, + { + "epoch": 1.1303407247160626, + "grad_norm": 3.3125, + "learning_rate": 0.018246714694840845, + "loss": 3.3101, + "mean_token_accuracy": 0.36342334747314453, + "num_tokens": 2136727093.0, + "step": 4180 + }, + { + "epoch": 1.1306111411573823, + "grad_norm": 3.625, + "learning_rate": 0.01824577970001919, + "loss": 3.6189, + "mean_token_accuracy": 0.3303282856941223, + "num_tokens": 2137251305.0, + "step": 4181 + }, + { + "epoch": 1.130881557598702, + "grad_norm": 3.265625, + "learning_rate": 0.018244844482875694, + "loss": 3.5496, + "mean_token_accuracy": 0.37281203269958496, + "num_tokens": 2137736199.0, + "step": 4182 + }, + { + "epoch": 1.1311519740400215, + "grad_norm": 3.375, + "learning_rate": 0.018243909043439067, + "loss": 3.4033, + "mean_token_accuracy": 0.358181357383728, + "num_tokens": 2138211819.0, + "step": 4183 + }, + { + "epoch": 1.1314223904813412, + "grad_norm": 3.234375, + "learning_rate": 0.018242973381738012, + "loss": 3.3464, + "mean_token_accuracy": 0.36738121509552, + "num_tokens": 2138736058.0, + "step": 4184 + }, + { + "epoch": 1.1316928069226608, + "grad_norm": 3.234375, + "learning_rate": 0.01824203749780123, + "loss": 3.4783, + "mean_token_accuracy": 0.37076038122177124, + "num_tokens": 2139251801.0, + "step": 4185 + }, + { + "epoch": 1.1319632233639805, + "grad_norm": 3.40625, + "learning_rate": 0.018241101391657442, + "loss": 3.4137, + "mean_token_accuracy": 0.36700499057769775, + "num_tokens": 2139775881.0, + "step": 4186 + }, + { + "epoch": 1.1322336398053001, + "grad_norm": 2.28125, + "learning_rate": 0.018240165063335364, + "loss": 3.3067, + "mean_token_accuracy": 0.372741162776947, + "num_tokens": 2140300051.0, + "step": 4187 + }, + { + "epoch": 1.1325040562466198, + "grad_norm": 3.59375, + "learning_rate": 0.018239228512863734, + "loss": 3.4476, + "mean_token_accuracy": 0.3473724126815796, + "num_tokens": 2140799367.0, + "step": 4188 + }, + { + "epoch": 1.1327744726879394, + "grad_norm": 3.546875, + "learning_rate": 0.018238291740271285, + "loss": 3.1907, + "mean_token_accuracy": 0.3757643699645996, + "num_tokens": 2141323609.0, + "step": 4189 + }, + { + "epoch": 1.133044889129259, + "grad_norm": 2.859375, + "learning_rate": 0.018237354745586756, + "loss": 3.2955, + "mean_token_accuracy": 0.35609447956085205, + "num_tokens": 2141847840.0, + "step": 4190 + }, + { + "epoch": 1.1333153055705787, + "grad_norm": 5.4375, + "learning_rate": 0.0182364175288389, + "loss": 9.2995, + "mean_token_accuracy": 2.7979228889307706e-06, + "num_tokens": 2142372012.0, + "step": 4191 + }, + { + "epoch": 1.1335857220118983, + "grad_norm": 6.84375, + "learning_rate": 0.018235480090056475, + "loss": 3.9047, + "mean_token_accuracy": 0.3161478638648987, + "num_tokens": 2142896278.0, + "step": 4192 + }, + { + "epoch": 1.133856138453218, + "grad_norm": 2.3125, + "learning_rate": 0.018234542429268245, + "loss": 3.6845, + "mean_token_accuracy": 0.3439146876335144, + "num_tokens": 2143420556.0, + "step": 4193 + }, + { + "epoch": 1.1341265548945376, + "grad_norm": 3.109375, + "learning_rate": 0.018233604546502972, + "loss": 3.4156, + "mean_token_accuracy": 0.34290361404418945, + "num_tokens": 2143944841.0, + "step": 4194 + }, + { + "epoch": 1.1343969713358573, + "grad_norm": 3.015625, + "learning_rate": 0.018232666441789443, + "loss": 3.3321, + "mean_token_accuracy": 0.3727424144744873, + "num_tokens": 2144469079.0, + "step": 4195 + }, + { + "epoch": 1.134667387777177, + "grad_norm": 2.78125, + "learning_rate": 0.01823172811515644, + "loss": 3.5651, + "mean_token_accuracy": 0.3508201837539673, + "num_tokens": 2144949000.0, + "step": 4196 + }, + { + "epoch": 1.1349378042184965, + "grad_norm": 3.421875, + "learning_rate": 0.01823078956663274, + "loss": 3.433, + "mean_token_accuracy": 0.37677010893821716, + "num_tokens": 2145473163.0, + "step": 4197 + }, + { + "epoch": 1.1352082206598162, + "grad_norm": 3.46875, + "learning_rate": 0.018229850796247164, + "loss": 3.2665, + "mean_token_accuracy": 0.38601958751678467, + "num_tokens": 2145960322.0, + "step": 4198 + }, + { + "epoch": 1.1354786371011358, + "grad_norm": 4.15625, + "learning_rate": 0.018228911804028497, + "loss": 3.5541, + "mean_token_accuracy": 0.3854583501815796, + "num_tokens": 2146439830.0, + "step": 4199 + }, + { + "epoch": 1.1357490535424555, + "grad_norm": 3.203125, + "learning_rate": 0.01822797259000556, + "loss": 3.4695, + "mean_token_accuracy": 0.36250555515289307, + "num_tokens": 2146964067.0, + "step": 4200 + }, + { + "epoch": 1.1360194699837751, + "grad_norm": 2.71875, + "learning_rate": 0.018227033154207168, + "loss": 3.5001, + "mean_token_accuracy": 0.37897321581840515, + "num_tokens": 2147488331.0, + "step": 4201 + }, + { + "epoch": 1.1362898864250948, + "grad_norm": 3.453125, + "learning_rate": 0.018226093496662148, + "loss": 3.4454, + "mean_token_accuracy": 0.3644940257072449, + "num_tokens": 2148012612.0, + "step": 4202 + }, + { + "epoch": 1.1365603028664142, + "grad_norm": 2.515625, + "learning_rate": 0.018225153617399324, + "loss": 3.2065, + "mean_token_accuracy": 0.3850610852241516, + "num_tokens": 2148536791.0, + "step": 4203 + }, + { + "epoch": 1.1368307193077338, + "grad_norm": 3.5, + "learning_rate": 0.018224213516447537, + "loss": 2.9912, + "mean_token_accuracy": 0.3942390978336334, + "num_tokens": 2149061012.0, + "step": 4204 + }, + { + "epoch": 1.1371011357490535, + "grad_norm": 2.84375, + "learning_rate": 0.01822327319383564, + "loss": 3.1891, + "mean_token_accuracy": 0.3432139754295349, + "num_tokens": 2149585169.0, + "step": 4205 + }, + { + "epoch": 1.1373715521903731, + "grad_norm": 3.6875, + "learning_rate": 0.018222332649592478, + "loss": 3.4637, + "mean_token_accuracy": 0.4154176115989685, + "num_tokens": 2150109275.0, + "step": 4206 + }, + { + "epoch": 1.1376419686316928, + "grad_norm": 2.875, + "learning_rate": 0.01822139188374691, + "loss": 3.3582, + "mean_token_accuracy": 0.3543373942375183, + "num_tokens": 2150633525.0, + "step": 4207 + }, + { + "epoch": 1.1379123850730124, + "grad_norm": 2.546875, + "learning_rate": 0.0182204508963278, + "loss": 3.231, + "mean_token_accuracy": 0.3910582363605499, + "num_tokens": 2151157718.0, + "step": 4208 + }, + { + "epoch": 1.138182801514332, + "grad_norm": 2.9375, + "learning_rate": 0.018219509687364025, + "loss": 3.3225, + "mean_token_accuracy": 0.39051347970962524, + "num_tokens": 2151681883.0, + "step": 4209 + }, + { + "epoch": 1.1384532179556517, + "grad_norm": 3.34375, + "learning_rate": 0.018218568256884466, + "loss": 3.6294, + "mean_token_accuracy": 0.37341973185539246, + "num_tokens": 2152165765.0, + "step": 4210 + }, + { + "epoch": 1.1387236343969713, + "grad_norm": 15.0, + "learning_rate": 0.018217626604918, + "loss": 9.9296, + "mean_token_accuracy": 0.007752739358693361, + "num_tokens": 2152689955.0, + "step": 4211 + }, + { + "epoch": 1.138994050838291, + "grad_norm": 7.8125, + "learning_rate": 0.01821668473149352, + "loss": 3.7949, + "mean_token_accuracy": 0.35726198554039, + "num_tokens": 2153090160.0, + "step": 4212 + }, + { + "epoch": 1.1392644672796106, + "grad_norm": 2.78125, + "learning_rate": 0.018215742636639935, + "loss": 3.4802, + "mean_token_accuracy": 0.3667563796043396, + "num_tokens": 2153614362.0, + "step": 4213 + }, + { + "epoch": 1.1395348837209303, + "grad_norm": 2.75, + "learning_rate": 0.018214800320386144, + "loss": 3.166, + "mean_token_accuracy": 0.3982953131198883, + "num_tokens": 2154138631.0, + "step": 4214 + }, + { + "epoch": 1.13980530016225, + "grad_norm": 3.296875, + "learning_rate": 0.018213857782761064, + "loss": 3.3178, + "mean_token_accuracy": 0.38971149921417236, + "num_tokens": 2154662810.0, + "step": 4215 + }, + { + "epoch": 1.1400757166035695, + "grad_norm": 3.515625, + "learning_rate": 0.01821291502379361, + "loss": 3.6908, + "mean_token_accuracy": 0.365443617105484, + "num_tokens": 2155187072.0, + "step": 4216 + }, + { + "epoch": 1.1403461330448892, + "grad_norm": 2.96875, + "learning_rate": 0.01821197204351271, + "loss": 3.2, + "mean_token_accuracy": 0.36963462829589844, + "num_tokens": 2155698551.0, + "step": 4217 + }, + { + "epoch": 1.1406165494862088, + "grad_norm": 2.078125, + "learning_rate": 0.018211028841947304, + "loss": 3.3501, + "mean_token_accuracy": 0.38677531480789185, + "num_tokens": 2156190299.0, + "step": 4218 + }, + { + "epoch": 1.1408869659275285, + "grad_norm": 2.453125, + "learning_rate": 0.018210085419126325, + "loss": 3.2643, + "mean_token_accuracy": 0.37539732456207275, + "num_tokens": 2156714573.0, + "step": 4219 + }, + { + "epoch": 1.1411573823688481, + "grad_norm": 2.390625, + "learning_rate": 0.018209141775078718, + "loss": 3.388, + "mean_token_accuracy": 0.37156200408935547, + "num_tokens": 2157238799.0, + "step": 4220 + }, + { + "epoch": 1.1414277988101675, + "grad_norm": 3.203125, + "learning_rate": 0.018208197909833444, + "loss": 3.5096, + "mean_token_accuracy": 0.3444429039955139, + "num_tokens": 2157699692.0, + "step": 4221 + }, + { + "epoch": 1.1416982152514872, + "grad_norm": 2.921875, + "learning_rate": 0.018207253823419457, + "loss": 3.5718, + "mean_token_accuracy": 0.37008291482925415, + "num_tokens": 2158223936.0, + "step": 4222 + }, + { + "epoch": 1.1419686316928068, + "grad_norm": 3.078125, + "learning_rate": 0.018206309515865732, + "loss": 3.4517, + "mean_token_accuracy": 0.3457261919975281, + "num_tokens": 2158748203.0, + "step": 4223 + }, + { + "epoch": 1.1422390481341265, + "grad_norm": 2.59375, + "learning_rate": 0.01820536498720124, + "loss": 3.4368, + "mean_token_accuracy": 0.3810802102088928, + "num_tokens": 2159272389.0, + "step": 4224 + }, + { + "epoch": 1.142509464575446, + "grad_norm": 2.984375, + "learning_rate": 0.018204420237454954, + "loss": 3.2846, + "mean_token_accuracy": 0.3650509715080261, + "num_tokens": 2159796651.0, + "step": 4225 + }, + { + "epoch": 1.1427798810167658, + "grad_norm": 3.15625, + "learning_rate": 0.018203475266655875, + "loss": 3.2748, + "mean_token_accuracy": 0.3703913688659668, + "num_tokens": 2160320848.0, + "step": 4226 + }, + { + "epoch": 1.1430502974580854, + "grad_norm": 3.15625, + "learning_rate": 0.018202530074832985, + "loss": 3.4084, + "mean_token_accuracy": 0.36897292733192444, + "num_tokens": 2160845129.0, + "step": 4227 + }, + { + "epoch": 1.143320713899405, + "grad_norm": 2.03125, + "learning_rate": 0.018201584662015293, + "loss": 3.1235, + "mean_token_accuracy": 0.38093435764312744, + "num_tokens": 2161369328.0, + "step": 4228 + }, + { + "epoch": 1.1435911303407247, + "grad_norm": 2.59375, + "learning_rate": 0.018200639028231803, + "loss": 3.2701, + "mean_token_accuracy": 0.3641623556613922, + "num_tokens": 2161893511.0, + "step": 4229 + }, + { + "epoch": 1.1438615467820443, + "grad_norm": 2.59375, + "learning_rate": 0.018199693173511538, + "loss": 3.3501, + "mean_token_accuracy": 0.3900376558303833, + "num_tokens": 2162417688.0, + "step": 4230 + }, + { + "epoch": 1.144131963223364, + "grad_norm": 0.67578125, + "learning_rate": 0.01819874709788351, + "loss": 11.1178, + "mean_token_accuracy": 2.730766573222354e-05, + "num_tokens": 2162941948.0, + "step": 4231 + }, + { + "epoch": 1.1444023796646836, + "grad_norm": 10.5625, + "learning_rate": 0.01819780080137675, + "loss": 3.8517, + "mean_token_accuracy": 0.3466651439666748, + "num_tokens": 2163384312.0, + "step": 4232 + }, + { + "epoch": 1.1446727961060033, + "grad_norm": 3.703125, + "learning_rate": 0.018196854284020295, + "loss": 3.5225, + "mean_token_accuracy": 0.37824589014053345, + "num_tokens": 2163908573.0, + "step": 4233 + }, + { + "epoch": 1.144943212547323, + "grad_norm": 2.015625, + "learning_rate": 0.018195907545843187, + "loss": 3.2115, + "mean_token_accuracy": 0.3896045684814453, + "num_tokens": 2164432832.0, + "step": 4234 + }, + { + "epoch": 1.1452136289886425, + "grad_norm": 3.3125, + "learning_rate": 0.018194960586874472, + "loss": 2.9711, + "mean_token_accuracy": 0.38765743374824524, + "num_tokens": 2164956999.0, + "step": 4235 + }, + { + "epoch": 1.1454840454299622, + "grad_norm": 2.640625, + "learning_rate": 0.018194013407143207, + "loss": 3.5222, + "mean_token_accuracy": 0.3511166274547577, + "num_tokens": 2165481274.0, + "step": 4236 + }, + { + "epoch": 1.1457544618712818, + "grad_norm": 2.859375, + "learning_rate": 0.018193066006678457, + "loss": 3.3384, + "mean_token_accuracy": 0.3736550211906433, + "num_tokens": 2166005548.0, + "step": 4237 + }, + { + "epoch": 1.1460248783126015, + "grad_norm": 3.9375, + "learning_rate": 0.018192118385509288, + "loss": 3.7016, + "mean_token_accuracy": 0.3340362310409546, + "num_tokens": 2166529778.0, + "step": 4238 + }, + { + "epoch": 1.1462952947539211, + "grad_norm": 3.015625, + "learning_rate": 0.018191170543664772, + "loss": 3.3706, + "mean_token_accuracy": 0.3922325372695923, + "num_tokens": 2167053888.0, + "step": 4239 + }, + { + "epoch": 1.1465657111952408, + "grad_norm": 3.859375, + "learning_rate": 0.018190222481174, + "loss": 3.1931, + "mean_token_accuracy": 0.34412097930908203, + "num_tokens": 2167570828.0, + "step": 4240 + }, + { + "epoch": 1.1468361276365604, + "grad_norm": 3.578125, + "learning_rate": 0.018189274198066055, + "loss": 3.5157, + "mean_token_accuracy": 0.3697833716869354, + "num_tokens": 2168038190.0, + "step": 4241 + }, + { + "epoch": 1.14710654407788, + "grad_norm": 3.28125, + "learning_rate": 0.01818832569437004, + "loss": 3.4884, + "mean_token_accuracy": 0.3483818769454956, + "num_tokens": 2168562434.0, + "step": 4242 + }, + { + "epoch": 1.1473769605191997, + "grad_norm": 3.015625, + "learning_rate": 0.018187376970115045, + "loss": 3.4334, + "mean_token_accuracy": 0.36405229568481445, + "num_tokens": 2169086602.0, + "step": 4243 + }, + { + "epoch": 1.147647376960519, + "grad_norm": 3.125, + "learning_rate": 0.018186428025330194, + "loss": 3.408, + "mean_token_accuracy": 0.31578320264816284, + "num_tokens": 2169610793.0, + "step": 4244 + }, + { + "epoch": 1.1479177934018387, + "grad_norm": 2.390625, + "learning_rate": 0.018185478860044596, + "loss": 3.3694, + "mean_token_accuracy": 0.3947591185569763, + "num_tokens": 2170101700.0, + "step": 4245 + }, + { + "epoch": 1.1481882098431584, + "grad_norm": 2.765625, + "learning_rate": 0.018184529474287378, + "loss": 3.4947, + "mean_token_accuracy": 0.37059593200683594, + "num_tokens": 2170609439.0, + "step": 4246 + }, + { + "epoch": 1.148458626284478, + "grad_norm": 3.078125, + "learning_rate": 0.018183579868087663, + "loss": 3.5827, + "mean_token_accuracy": 0.3521517515182495, + "num_tokens": 2171082160.0, + "step": 4247 + }, + { + "epoch": 1.1487290427257977, + "grad_norm": 3.25, + "learning_rate": 0.018182630041474593, + "loss": 3.5367, + "mean_token_accuracy": 0.3814247250556946, + "num_tokens": 2171606328.0, + "step": 4248 + }, + { + "epoch": 1.1489994591671173, + "grad_norm": 3.265625, + "learning_rate": 0.01818167999447731, + "loss": 3.4238, + "mean_token_accuracy": 0.36350929737091064, + "num_tokens": 2172130566.0, + "step": 4249 + }, + { + "epoch": 1.149269875608437, + "grad_norm": 3.015625, + "learning_rate": 0.018180729727124964, + "loss": 3.4482, + "mean_token_accuracy": 0.3678490221500397, + "num_tokens": 2172654813.0, + "step": 4250 + }, + { + "epoch": 1.1495402920497566, + "grad_norm": 2.8125, + "learning_rate": 0.01817977923944672, + "loss": 10.1657, + "mean_token_accuracy": 2.267032687086612e-05, + "num_tokens": 2173169967.0, + "step": 4251 + }, + { + "epoch": 1.1498107084910762, + "grad_norm": 7.125, + "learning_rate": 0.01817882853147173, + "loss": 3.946, + "mean_token_accuracy": 0.3280695974826813, + "num_tokens": 2173579579.0, + "step": 4252 + }, + { + "epoch": 1.150081124932396, + "grad_norm": 3.65625, + "learning_rate": 0.018177877603229167, + "loss": 3.3076, + "mean_token_accuracy": 0.40277180075645447, + "num_tokens": 2174040424.0, + "step": 4253 + }, + { + "epoch": 1.1503515413737155, + "grad_norm": 3.4375, + "learning_rate": 0.01817692645474821, + "loss": 3.3752, + "mean_token_accuracy": 0.3406551480293274, + "num_tokens": 2174564475.0, + "step": 4254 + }, + { + "epoch": 1.1506219578150352, + "grad_norm": 3.28125, + "learning_rate": 0.018175975086058047, + "loss": 3.6229, + "mean_token_accuracy": 0.3541192412376404, + "num_tokens": 2175088748.0, + "step": 4255 + }, + { + "epoch": 1.1508923742563548, + "grad_norm": 3.90625, + "learning_rate": 0.018175023497187862, + "loss": 3.5288, + "mean_token_accuracy": 0.3545626699924469, + "num_tokens": 2175612984.0, + "step": 4256 + }, + { + "epoch": 1.1511627906976745, + "grad_norm": 3.390625, + "learning_rate": 0.01817407168816686, + "loss": 3.4388, + "mean_token_accuracy": 0.38793402910232544, + "num_tokens": 2176113126.0, + "step": 4257 + }, + { + "epoch": 1.151433207138994, + "grad_norm": 3.828125, + "learning_rate": 0.018173119659024236, + "loss": 3.4679, + "mean_token_accuracy": 0.3588070869445801, + "num_tokens": 2176637339.0, + "step": 4258 + }, + { + "epoch": 1.1517036235803138, + "grad_norm": 2.796875, + "learning_rate": 0.018172167409789212, + "loss": 3.1946, + "mean_token_accuracy": 0.3668178915977478, + "num_tokens": 2177161507.0, + "step": 4259 + }, + { + "epoch": 1.1519740400216334, + "grad_norm": 2.28125, + "learning_rate": 0.018171214940490996, + "loss": 3.5267, + "mean_token_accuracy": 0.37787294387817383, + "num_tokens": 2177685596.0, + "step": 4260 + }, + { + "epoch": 1.152244456462953, + "grad_norm": 16.875, + "learning_rate": 0.018170262251158815, + "loss": 3.0328, + "mean_token_accuracy": 0.4340289235115051, + "num_tokens": 2178209877.0, + "step": 4261 + }, + { + "epoch": 1.1525148729042725, + "grad_norm": 3.6875, + "learning_rate": 0.018169309341821902, + "loss": 3.7619, + "mean_token_accuracy": 0.3271123170852661, + "num_tokens": 2178734047.0, + "step": 4262 + }, + { + "epoch": 1.152785289345592, + "grad_norm": 3.078125, + "learning_rate": 0.0181683562125095, + "loss": 3.4643, + "mean_token_accuracy": 0.36403006315231323, + "num_tokens": 2179258276.0, + "step": 4263 + }, + { + "epoch": 1.1530557057869117, + "grad_norm": 5.46875, + "learning_rate": 0.01816740286325084, + "loss": 3.8063, + "mean_token_accuracy": 0.33507275581359863, + "num_tokens": 2179782391.0, + "step": 4264 + }, + { + "epoch": 1.1533261222282314, + "grad_norm": 3.703125, + "learning_rate": 0.01816644929407519, + "loss": 3.3201, + "mean_token_accuracy": 0.3530053496360779, + "num_tokens": 2180289005.0, + "step": 4265 + }, + { + "epoch": 1.153596538669551, + "grad_norm": 2.9375, + "learning_rate": 0.018165495505011797, + "loss": 3.3954, + "mean_token_accuracy": 0.3859005868434906, + "num_tokens": 2180758708.0, + "step": 4266 + }, + { + "epoch": 1.1538669551108707, + "grad_norm": 2.90625, + "learning_rate": 0.01816454149608993, + "loss": 3.367, + "mean_token_accuracy": 0.3708151578903198, + "num_tokens": 2181266005.0, + "step": 4267 + }, + { + "epoch": 1.1541373715521903, + "grad_norm": 2.28125, + "learning_rate": 0.018163587267338865, + "loss": 3.3554, + "mean_token_accuracy": 0.3479270935058594, + "num_tokens": 2181790272.0, + "step": 4268 + }, + { + "epoch": 1.15440778799351, + "grad_norm": 3.03125, + "learning_rate": 0.01816263281878787, + "loss": 3.4585, + "mean_token_accuracy": 0.3790721595287323, + "num_tokens": 2182314542.0, + "step": 4269 + }, + { + "epoch": 1.1546782044348296, + "grad_norm": 3.15625, + "learning_rate": 0.018161678150466237, + "loss": 3.5471, + "mean_token_accuracy": 0.36310049891471863, + "num_tokens": 2182802090.0, + "step": 4270 + }, + { + "epoch": 1.1549486208761492, + "grad_norm": 24.5, + "learning_rate": 0.018160723262403256, + "loss": 18.8895, + "mean_token_accuracy": 0.03710126876831055, + "num_tokens": 2183326223.0, + "step": 4271 + }, + { + "epoch": 1.1552190373174689, + "grad_norm": 10.0, + "learning_rate": 0.01815976815462823, + "loss": 4.099, + "mean_token_accuracy": 0.2910924553871155, + "num_tokens": 2183850387.0, + "step": 4272 + }, + { + "epoch": 1.1554894537587885, + "grad_norm": 6.90625, + "learning_rate": 0.01815881282717046, + "loss": 2.9945, + "mean_token_accuracy": 0.38781699538230896, + "num_tokens": 2184374504.0, + "step": 4273 + }, + { + "epoch": 1.1557598702001082, + "grad_norm": 2.09375, + "learning_rate": 0.018157857280059263, + "loss": 3.5156, + "mean_token_accuracy": 0.3395603895187378, + "num_tokens": 2184898695.0, + "step": 4274 + }, + { + "epoch": 1.1560302866414278, + "grad_norm": 2.5625, + "learning_rate": 0.018156901513323952, + "loss": 3.4466, + "mean_token_accuracy": 0.3705715537071228, + "num_tokens": 2185422807.0, + "step": 4275 + }, + { + "epoch": 1.1563007030827475, + "grad_norm": 3.53125, + "learning_rate": 0.018155945526993855, + "loss": 3.2395, + "mean_token_accuracy": 0.37357455492019653, + "num_tokens": 2185925836.0, + "step": 4276 + }, + { + "epoch": 1.156571119524067, + "grad_norm": 3.265625, + "learning_rate": 0.018154989321098304, + "loss": 3.3863, + "mean_token_accuracy": 0.3681787848472595, + "num_tokens": 2186389543.0, + "step": 4277 + }, + { + "epoch": 1.1568415359653867, + "grad_norm": 3.59375, + "learning_rate": 0.01815403289566664, + "loss": 3.2375, + "mean_token_accuracy": 0.37769025564193726, + "num_tokens": 2186913773.0, + "step": 4278 + }, + { + "epoch": 1.1571119524067064, + "grad_norm": 2.65625, + "learning_rate": 0.01815307625072821, + "loss": 3.7058, + "mean_token_accuracy": 0.3380208909511566, + "num_tokens": 2187397324.0, + "step": 4279 + }, + { + "epoch": 1.157382368848026, + "grad_norm": 2.90625, + "learning_rate": 0.018152119386312362, + "loss": 3.2921, + "mean_token_accuracy": 0.3483750820159912, + "num_tokens": 2187921541.0, + "step": 4280 + }, + { + "epoch": 1.1576527852893457, + "grad_norm": 2.6875, + "learning_rate": 0.01815116230244846, + "loss": 3.322, + "mean_token_accuracy": 0.38556933403015137, + "num_tokens": 2188424164.0, + "step": 4281 + }, + { + "epoch": 1.1579232017306653, + "grad_norm": 2.921875, + "learning_rate": 0.018150204999165868, + "loss": 3.4006, + "mean_token_accuracy": 0.36908215284347534, + "num_tokens": 2188948371.0, + "step": 4282 + }, + { + "epoch": 1.158193618171985, + "grad_norm": 2.40625, + "learning_rate": 0.01814924747649396, + "loss": 3.0683, + "mean_token_accuracy": 0.4049363136291504, + "num_tokens": 2189465114.0, + "step": 4283 + }, + { + "epoch": 1.1584640346133046, + "grad_norm": 2.28125, + "learning_rate": 0.018148289734462114, + "loss": 3.4634, + "mean_token_accuracy": 0.37711548805236816, + "num_tokens": 2189989388.0, + "step": 4284 + }, + { + "epoch": 1.158734451054624, + "grad_norm": 3.359375, + "learning_rate": 0.018147331773099715, + "loss": 3.1791, + "mean_token_accuracy": 0.39035505056381226, + "num_tokens": 2190470583.0, + "step": 4285 + }, + { + "epoch": 1.1590048674959437, + "grad_norm": 2.453125, + "learning_rate": 0.018146373592436163, + "loss": 3.4423, + "mean_token_accuracy": 0.377552330493927, + "num_tokens": 2190994771.0, + "step": 4286 + }, + { + "epoch": 1.1592752839372633, + "grad_norm": 2.765625, + "learning_rate": 0.01814541519250085, + "loss": 3.4199, + "mean_token_accuracy": 0.35727524757385254, + "num_tokens": 2191519011.0, + "step": 4287 + }, + { + "epoch": 1.159545700378583, + "grad_norm": 2.515625, + "learning_rate": 0.01814445657332319, + "loss": 3.3124, + "mean_token_accuracy": 0.33778268098831177, + "num_tokens": 2192043201.0, + "step": 4288 + }, + { + "epoch": 1.1598161168199026, + "grad_norm": 3.375, + "learning_rate": 0.018143497734932585, + "loss": 3.4015, + "mean_token_accuracy": 0.3711232542991638, + "num_tokens": 2192563987.0, + "step": 4289 + }, + { + "epoch": 1.1600865332612222, + "grad_norm": 13.0, + "learning_rate": 0.018142538677358468, + "loss": 3.1195, + "mean_token_accuracy": 0.37124401330947876, + "num_tokens": 2193088256.0, + "step": 4290 + }, + { + "epoch": 1.1603569497025419, + "grad_norm": 34.75, + "learning_rate": 0.018141579400630256, + "loss": 16.7534, + "mean_token_accuracy": 0.0, + "num_tokens": 2193612524.0, + "step": 4291 + }, + { + "epoch": 1.1606273661438615, + "grad_norm": 4.84375, + "learning_rate": 0.018140619904777385, + "loss": 3.5942, + "mean_token_accuracy": 0.3579617738723755, + "num_tokens": 2194085462.0, + "step": 4292 + }, + { + "epoch": 1.1608977825851812, + "grad_norm": 3.09375, + "learning_rate": 0.018139660189829297, + "loss": 3.4436, + "mean_token_accuracy": 0.37148424983024597, + "num_tokens": 2194609633.0, + "step": 4293 + }, + { + "epoch": 1.1611681990265008, + "grad_norm": 3.6875, + "learning_rate": 0.018138700255815435, + "loss": 3.6527, + "mean_token_accuracy": 0.3454408347606659, + "num_tokens": 2195133738.0, + "step": 4294 + }, + { + "epoch": 1.1614386154678205, + "grad_norm": 3.890625, + "learning_rate": 0.018137740102765264, + "loss": 3.5037, + "mean_token_accuracy": 0.3722473680973053, + "num_tokens": 2195615221.0, + "step": 4295 + }, + { + "epoch": 1.16170903190914, + "grad_norm": 3.28125, + "learning_rate": 0.018136779730708232, + "loss": 3.6814, + "mean_token_accuracy": 0.3621617257595062, + "num_tokens": 2196088526.0, + "step": 4296 + }, + { + "epoch": 1.1619794483504597, + "grad_norm": 3.421875, + "learning_rate": 0.018135819139673808, + "loss": 3.407, + "mean_token_accuracy": 0.3368169069290161, + "num_tokens": 2196612727.0, + "step": 4297 + }, + { + "epoch": 1.1622498647917794, + "grad_norm": 2.75, + "learning_rate": 0.01813485832969147, + "loss": 3.3143, + "mean_token_accuracy": 0.39615917205810547, + "num_tokens": 2197136887.0, + "step": 4298 + }, + { + "epoch": 1.162520281233099, + "grad_norm": 3.1875, + "learning_rate": 0.018133897300790694, + "loss": 3.3828, + "mean_token_accuracy": 0.36521026492118835, + "num_tokens": 2197660980.0, + "step": 4299 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 3.4375, + "learning_rate": 0.018132936053000974, + "loss": 3.0796, + "mean_token_accuracy": 0.41586506366729736, + "num_tokens": 2198185178.0, + "step": 4300 + }, + { + "epoch": 1.1630611141157383, + "grad_norm": 3.375, + "learning_rate": 0.018131974586351792, + "loss": 3.3071, + "mean_token_accuracy": 0.3628764748573303, + "num_tokens": 2198709397.0, + "step": 4301 + }, + { + "epoch": 1.163331530557058, + "grad_norm": 3.140625, + "learning_rate": 0.01813101290087266, + "loss": 3.5372, + "mean_token_accuracy": 0.359422504901886, + "num_tokens": 2199221513.0, + "step": 4302 + }, + { + "epoch": 1.1636019469983774, + "grad_norm": 3.546875, + "learning_rate": 0.01813005099659308, + "loss": 3.7101, + "mean_token_accuracy": 0.35316401720046997, + "num_tokens": 2199745770.0, + "step": 4303 + }, + { + "epoch": 1.163872363439697, + "grad_norm": 3.671875, + "learning_rate": 0.01812908887354257, + "loss": 3.3053, + "mean_token_accuracy": 0.3723415732383728, + "num_tokens": 2200231441.0, + "step": 4304 + }, + { + "epoch": 1.1641427798810167, + "grad_norm": 2.609375, + "learning_rate": 0.018128126531750645, + "loss": 3.4062, + "mean_token_accuracy": 0.3849151134490967, + "num_tokens": 2200711603.0, + "step": 4305 + }, + { + "epoch": 1.1644131963223363, + "grad_norm": 2.96875, + "learning_rate": 0.018127163971246838, + "loss": 3.2652, + "mean_token_accuracy": 0.3979037404060364, + "num_tokens": 2201235843.0, + "step": 4306 + }, + { + "epoch": 1.164683612763656, + "grad_norm": 2.71875, + "learning_rate": 0.018126201192060678, + "loss": 3.2365, + "mean_token_accuracy": 0.3858869671821594, + "num_tokens": 2201744198.0, + "step": 4307 + }, + { + "epoch": 1.1649540292049756, + "grad_norm": 2.53125, + "learning_rate": 0.01812523819422171, + "loss": 3.1727, + "mean_token_accuracy": 0.39463087916374207, + "num_tokens": 2202268435.0, + "step": 4308 + }, + { + "epoch": 1.1652244456462952, + "grad_norm": 3.0, + "learning_rate": 0.018124274977759484, + "loss": 3.1261, + "mean_token_accuracy": 0.3774614930152893, + "num_tokens": 2202792709.0, + "step": 4309 + }, + { + "epoch": 1.1654948620876149, + "grad_norm": 3.40625, + "learning_rate": 0.018123311542703545, + "loss": 3.2714, + "mean_token_accuracy": 0.38324636220932007, + "num_tokens": 2203314741.0, + "step": 4310 + }, + { + "epoch": 1.1657652785289345, + "grad_norm": 8.875, + "learning_rate": 0.01812234788908346, + "loss": 13.667, + "mean_token_accuracy": 0.005979783833026886, + "num_tokens": 2203786026.0, + "step": 4311 + }, + { + "epoch": 1.1660356949702542, + "grad_norm": 5.59375, + "learning_rate": 0.018121384016928803, + "loss": 3.8297, + "mean_token_accuracy": 0.3103949725627899, + "num_tokens": 2204300297.0, + "step": 4312 + }, + { + "epoch": 1.1663061114115738, + "grad_norm": 1.78125, + "learning_rate": 0.018120419926269133, + "loss": 3.4102, + "mean_token_accuracy": 0.3660317063331604, + "num_tokens": 2204824509.0, + "step": 4313 + }, + { + "epoch": 1.1665765278528935, + "grad_norm": 3.40625, + "learning_rate": 0.018119455617134045, + "loss": 3.486, + "mean_token_accuracy": 0.3635581433773041, + "num_tokens": 2205320842.0, + "step": 4314 + }, + { + "epoch": 1.166846944294213, + "grad_norm": 2.828125, + "learning_rate": 0.01811849108955312, + "loss": 3.5283, + "mean_token_accuracy": 0.3577074408531189, + "num_tokens": 2205844993.0, + "step": 4315 + }, + { + "epoch": 1.1671173607355327, + "grad_norm": 2.859375, + "learning_rate": 0.018117526343555953, + "loss": 3.3743, + "mean_token_accuracy": 0.37708455324172974, + "num_tokens": 2206369188.0, + "step": 4316 + }, + { + "epoch": 1.1673877771768524, + "grad_norm": 2.640625, + "learning_rate": 0.01811656137917215, + "loss": 3.2281, + "mean_token_accuracy": 0.38737908005714417, + "num_tokens": 2206893413.0, + "step": 4317 + }, + { + "epoch": 1.167658193618172, + "grad_norm": 3.46875, + "learning_rate": 0.018115596196431313, + "loss": 3.4808, + "mean_token_accuracy": 0.36902302503585815, + "num_tokens": 2207417626.0, + "step": 4318 + }, + { + "epoch": 1.1679286100594917, + "grad_norm": 2.890625, + "learning_rate": 0.01811463079536306, + "loss": 3.4367, + "mean_token_accuracy": 0.3917604088783264, + "num_tokens": 2207941825.0, + "step": 4319 + }, + { + "epoch": 1.1681990265008113, + "grad_norm": 3.234375, + "learning_rate": 0.01811366517599701, + "loss": 3.1933, + "mean_token_accuracy": 0.36819320917129517, + "num_tokens": 2208465994.0, + "step": 4320 + }, + { + "epoch": 1.168469442942131, + "grad_norm": 2.6875, + "learning_rate": 0.018112699338362792, + "loss": 2.9536, + "mean_token_accuracy": 0.4064251780509949, + "num_tokens": 2208932314.0, + "step": 4321 + }, + { + "epoch": 1.1687398593834506, + "grad_norm": 2.578125, + "learning_rate": 0.018111733282490043, + "loss": 3.471, + "mean_token_accuracy": 0.34575921297073364, + "num_tokens": 2209456469.0, + "step": 4322 + }, + { + "epoch": 1.1690102758247702, + "grad_norm": 2.359375, + "learning_rate": 0.018110767008408406, + "loss": 3.3106, + "mean_token_accuracy": 0.3702065348625183, + "num_tokens": 2209980592.0, + "step": 4323 + }, + { + "epoch": 1.1692806922660899, + "grad_norm": 2.453125, + "learning_rate": 0.01810980051614752, + "loss": 3.464, + "mean_token_accuracy": 0.3825828433036804, + "num_tokens": 2210471655.0, + "step": 4324 + }, + { + "epoch": 1.1695511087074095, + "grad_norm": 2.890625, + "learning_rate": 0.018108833805737048, + "loss": 3.3937, + "mean_token_accuracy": 0.37352392077445984, + "num_tokens": 2210995929.0, + "step": 4325 + }, + { + "epoch": 1.169821525148729, + "grad_norm": 2.65625, + "learning_rate": 0.018107866877206648, + "loss": 3.504, + "mean_token_accuracy": 0.39119595289230347, + "num_tokens": 2211455461.0, + "step": 4326 + }, + { + "epoch": 1.1700919415900486, + "grad_norm": 3.765625, + "learning_rate": 0.018106899730585987, + "loss": 3.6447, + "mean_token_accuracy": 0.36705076694488525, + "num_tokens": 2211979646.0, + "step": 4327 + }, + { + "epoch": 1.1703623580313682, + "grad_norm": 2.875, + "learning_rate": 0.01810593236590475, + "loss": 3.3201, + "mean_token_accuracy": 0.36641860008239746, + "num_tokens": 2212503743.0, + "step": 4328 + }, + { + "epoch": 1.1706327744726879, + "grad_norm": 2.625, + "learning_rate": 0.018104964783192606, + "loss": 3.4124, + "mean_token_accuracy": 0.38737955689430237, + "num_tokens": 2213027988.0, + "step": 4329 + }, + { + "epoch": 1.1709031909140075, + "grad_norm": 2.6875, + "learning_rate": 0.018103996982479248, + "loss": 3.2202, + "mean_token_accuracy": 0.3734920918941498, + "num_tokens": 2213552063.0, + "step": 4330 + }, + { + "epoch": 1.1711736073553272, + "grad_norm": 22.75, + "learning_rate": 0.01810302896379437, + "loss": 11.9333, + "mean_token_accuracy": 4.0557653846917674e-05, + "num_tokens": 2214076090.0, + "step": 4331 + }, + { + "epoch": 1.1714440237966468, + "grad_norm": 7.5, + "learning_rate": 0.018102060727167676, + "loss": 3.4131, + "mean_token_accuracy": 0.358512818813324, + "num_tokens": 2214600248.0, + "step": 4332 + }, + { + "epoch": 1.1717144402379664, + "grad_norm": 3.34375, + "learning_rate": 0.018101092272628877, + "loss": 3.4867, + "mean_token_accuracy": 0.3732549250125885, + "num_tokens": 2215077565.0, + "step": 4333 + }, + { + "epoch": 1.171984856679286, + "grad_norm": 4.0625, + "learning_rate": 0.018100123600207678, + "loss": 3.5031, + "mean_token_accuracy": 0.37099140882492065, + "num_tokens": 2215601722.0, + "step": 4334 + }, + { + "epoch": 1.1722552731206057, + "grad_norm": 3.015625, + "learning_rate": 0.01809915470993381, + "loss": 3.5549, + "mean_token_accuracy": 0.35394811630249023, + "num_tokens": 2216125962.0, + "step": 4335 + }, + { + "epoch": 1.1725256895619254, + "grad_norm": 2.71875, + "learning_rate": 0.018098185601837, + "loss": 3.5135, + "mean_token_accuracy": 0.39391225576400757, + "num_tokens": 2216567363.0, + "step": 4336 + }, + { + "epoch": 1.172796106003245, + "grad_norm": 3.5625, + "learning_rate": 0.018097216275946976, + "loss": 3.1753, + "mean_token_accuracy": 0.38970398902893066, + "num_tokens": 2217091428.0, + "step": 4337 + }, + { + "epoch": 1.1730665224445647, + "grad_norm": 3.265625, + "learning_rate": 0.018096246732293486, + "loss": 3.2941, + "mean_token_accuracy": 0.3886149525642395, + "num_tokens": 2217615555.0, + "step": 4338 + }, + { + "epoch": 1.1733369388858843, + "grad_norm": 2.5, + "learning_rate": 0.01809527697090628, + "loss": 3.1097, + "mean_token_accuracy": 0.40271300077438354, + "num_tokens": 2218139826.0, + "step": 4339 + }, + { + "epoch": 1.173607355327204, + "grad_norm": 3.1875, + "learning_rate": 0.018094306991815107, + "loss": 3.2709, + "mean_token_accuracy": 0.3559296727180481, + "num_tokens": 2218663914.0, + "step": 4340 + }, + { + "epoch": 1.1738777717685236, + "grad_norm": 2.65625, + "learning_rate": 0.018093336795049736, + "loss": 3.1523, + "mean_token_accuracy": 0.3981948792934418, + "num_tokens": 2219188098.0, + "step": 4341 + }, + { + "epoch": 1.1741481882098432, + "grad_norm": 2.984375, + "learning_rate": 0.01809236638063993, + "loss": 3.3851, + "mean_token_accuracy": 0.375224232673645, + "num_tokens": 2219712376.0, + "step": 4342 + }, + { + "epoch": 1.1744186046511629, + "grad_norm": 2.828125, + "learning_rate": 0.01809139574861547, + "loss": 3.4044, + "mean_token_accuracy": 0.3747404217720032, + "num_tokens": 2220236487.0, + "step": 4343 + }, + { + "epoch": 1.1746890210924823, + "grad_norm": 2.59375, + "learning_rate": 0.018090424899006134, + "loss": 3.2087, + "mean_token_accuracy": 0.38926756381988525, + "num_tokens": 2220760710.0, + "step": 4344 + }, + { + "epoch": 1.174959437533802, + "grad_norm": 3.09375, + "learning_rate": 0.01808945383184171, + "loss": 3.1356, + "mean_token_accuracy": 0.37972021102905273, + "num_tokens": 2221284857.0, + "step": 4345 + }, + { + "epoch": 1.1752298539751216, + "grad_norm": 3.359375, + "learning_rate": 0.01808848254715199, + "loss": 3.5538, + "mean_token_accuracy": 0.348788857460022, + "num_tokens": 2221809038.0, + "step": 4346 + }, + { + "epoch": 1.1755002704164412, + "grad_norm": 2.953125, + "learning_rate": 0.01808751104496678, + "loss": 3.3151, + "mean_token_accuracy": 0.37100648880004883, + "num_tokens": 2222333204.0, + "step": 4347 + }, + { + "epoch": 1.1757706868577609, + "grad_norm": 3.28125, + "learning_rate": 0.01808653932531589, + "loss": 3.3023, + "mean_token_accuracy": 0.3752196133136749, + "num_tokens": 2222857448.0, + "step": 4348 + }, + { + "epoch": 1.1760411032990805, + "grad_norm": 18.125, + "learning_rate": 0.018085567388229133, + "loss": 3.1834, + "mean_token_accuracy": 0.4174478352069855, + "num_tokens": 2223316038.0, + "step": 4349 + }, + { + "epoch": 1.1763115197404002, + "grad_norm": 3.296875, + "learning_rate": 0.01808459523373633, + "loss": 3.3913, + "mean_token_accuracy": 0.3877435326576233, + "num_tokens": 2223792430.0, + "step": 4350 + }, + { + "epoch": 1.1765819361817198, + "grad_norm": 15.5, + "learning_rate": 0.018083622861867314, + "loss": 10.9751, + "mean_token_accuracy": 0.010174726136028767, + "num_tokens": 2224248356.0, + "step": 4351 + }, + { + "epoch": 1.1768523526230394, + "grad_norm": 15.4375, + "learning_rate": 0.018082650272651917, + "loss": 4.0647, + "mean_token_accuracy": 0.3102262616157532, + "num_tokens": 2224772587.0, + "step": 4352 + }, + { + "epoch": 1.177122769064359, + "grad_norm": 3.0, + "learning_rate": 0.018081677466119977, + "loss": 3.607, + "mean_token_accuracy": 0.347734272480011, + "num_tokens": 2225296766.0, + "step": 4353 + }, + { + "epoch": 1.1773931855056787, + "grad_norm": 2.4375, + "learning_rate": 0.01808070444230135, + "loss": 3.5198, + "mean_token_accuracy": 0.37654992938041687, + "num_tokens": 2225763853.0, + "step": 4354 + }, + { + "epoch": 1.1776636019469984, + "grad_norm": 3.96875, + "learning_rate": 0.018079731201225885, + "loss": 3.496, + "mean_token_accuracy": 0.3562282621860504, + "num_tokens": 2226287847.0, + "step": 4355 + }, + { + "epoch": 1.177934018388318, + "grad_norm": 3.109375, + "learning_rate": 0.01807875774292345, + "loss": 3.2404, + "mean_token_accuracy": 0.3920617699623108, + "num_tokens": 2226812044.0, + "step": 4356 + }, + { + "epoch": 1.1782044348296377, + "grad_norm": 3.421875, + "learning_rate": 0.01807778406742391, + "loss": 3.4161, + "mean_token_accuracy": 0.3723483085632324, + "num_tokens": 2227316871.0, + "step": 4357 + }, + { + "epoch": 1.1784748512709573, + "grad_norm": 3.3125, + "learning_rate": 0.018076810174757138, + "loss": 3.5293, + "mean_token_accuracy": 0.3558696210384369, + "num_tokens": 2227831224.0, + "step": 4358 + }, + { + "epoch": 1.178745267712277, + "grad_norm": 3.515625, + "learning_rate": 0.018075836064953017, + "loss": 3.3038, + "mean_token_accuracy": 0.39740872383117676, + "num_tokens": 2228309842.0, + "step": 4359 + }, + { + "epoch": 1.1790156841535966, + "grad_norm": 2.921875, + "learning_rate": 0.018074861738041438, + "loss": 3.1343, + "mean_token_accuracy": 0.3576640486717224, + "num_tokens": 2228834119.0, + "step": 4360 + }, + { + "epoch": 1.1792861005949162, + "grad_norm": 2.65625, + "learning_rate": 0.018073887194052295, + "loss": 3.4747, + "mean_token_accuracy": 0.3823325037956238, + "num_tokens": 2229319921.0, + "step": 4361 + }, + { + "epoch": 1.1795565170362359, + "grad_norm": 4.21875, + "learning_rate": 0.01807291243301549, + "loss": 3.4929, + "mean_token_accuracy": 0.3505094051361084, + "num_tokens": 2229844189.0, + "step": 4362 + }, + { + "epoch": 1.1798269334775555, + "grad_norm": 2.796875, + "learning_rate": 0.018071937454960933, + "loss": 2.9205, + "mean_token_accuracy": 0.3926593065261841, + "num_tokens": 2230368441.0, + "step": 4363 + }, + { + "epoch": 1.1800973499188752, + "grad_norm": 2.515625, + "learning_rate": 0.018070962259918538, + "loss": 3.5208, + "mean_token_accuracy": 0.3905014991760254, + "num_tokens": 2230799723.0, + "step": 4364 + }, + { + "epoch": 1.1803677663601948, + "grad_norm": 2.984375, + "learning_rate": 0.01806998684791822, + "loss": 3.2303, + "mean_token_accuracy": 0.39039477705955505, + "num_tokens": 2231300962.0, + "step": 4365 + }, + { + "epoch": 1.1806381828015144, + "grad_norm": 2.171875, + "learning_rate": 0.018069011218989923, + "loss": 3.2543, + "mean_token_accuracy": 0.37636154890060425, + "num_tokens": 2231825204.0, + "step": 4366 + }, + { + "epoch": 1.1809085992428339, + "grad_norm": 3.328125, + "learning_rate": 0.018068035373163566, + "loss": 3.586, + "mean_token_accuracy": 0.35871684551239014, + "num_tokens": 2232349484.0, + "step": 4367 + }, + { + "epoch": 1.1811790156841535, + "grad_norm": 3.546875, + "learning_rate": 0.018067059310469105, + "loss": 3.482, + "mean_token_accuracy": 0.36733806133270264, + "num_tokens": 2232862992.0, + "step": 4368 + }, + { + "epoch": 1.1814494321254732, + "grad_norm": 2.6875, + "learning_rate": 0.01806608303093648, + "loss": 3.6065, + "mean_token_accuracy": 0.37116992473602295, + "num_tokens": 2233329394.0, + "step": 4369 + }, + { + "epoch": 1.1817198485667928, + "grad_norm": 3.0, + "learning_rate": 0.018065106534595643, + "loss": 3.3001, + "mean_token_accuracy": 0.3783606290817261, + "num_tokens": 2233811566.0, + "step": 4370 + }, + { + "epoch": 1.1819902650081124, + "grad_norm": 1.421875, + "learning_rate": 0.018064129821476566, + "loss": 10.8687, + "mean_token_accuracy": 2.5275261577917263e-05, + "num_tokens": 2234335823.0, + "step": 4371 + }, + { + "epoch": 1.182260681449432, + "grad_norm": 6.84375, + "learning_rate": 0.018063152891609206, + "loss": 3.8557, + "mean_token_accuracy": 0.33149254322052, + "num_tokens": 2234859946.0, + "step": 4372 + }, + { + "epoch": 1.1825310978907517, + "grad_norm": 2.890625, + "learning_rate": 0.018062175745023546, + "loss": 3.5378, + "mean_token_accuracy": 0.36670517921447754, + "num_tokens": 2235384221.0, + "step": 4373 + }, + { + "epoch": 1.1828015143320714, + "grad_norm": 3.203125, + "learning_rate": 0.018061198381749567, + "loss": 3.3529, + "mean_token_accuracy": 0.3734428286552429, + "num_tokens": 2235868589.0, + "step": 4374 + }, + { + "epoch": 1.183071930773391, + "grad_norm": 3.40625, + "learning_rate": 0.018060220801817257, + "loss": 3.5987, + "mean_token_accuracy": 0.3533098101615906, + "num_tokens": 2236392857.0, + "step": 4375 + }, + { + "epoch": 1.1833423472147107, + "grad_norm": 3.078125, + "learning_rate": 0.018059243005256607, + "loss": 3.5042, + "mean_token_accuracy": 0.376350998878479, + "num_tokens": 2236884909.0, + "step": 4376 + }, + { + "epoch": 1.1836127636560303, + "grad_norm": 3.4375, + "learning_rate": 0.01805826499209762, + "loss": 3.6098, + "mean_token_accuracy": 0.32828933000564575, + "num_tokens": 2237409149.0, + "step": 4377 + }, + { + "epoch": 1.18388318009735, + "grad_norm": 2.65625, + "learning_rate": 0.01805728676237031, + "loss": 3.2514, + "mean_token_accuracy": 0.3985940217971802, + "num_tokens": 2237933435.0, + "step": 4378 + }, + { + "epoch": 1.1841535965386696, + "grad_norm": 3.03125, + "learning_rate": 0.01805630831610469, + "loss": 3.1552, + "mean_token_accuracy": 0.3734790086746216, + "num_tokens": 2238457607.0, + "step": 4379 + }, + { + "epoch": 1.1844240129799892, + "grad_norm": 17.5, + "learning_rate": 0.018055329653330777, + "loss": 3.6948, + "mean_token_accuracy": 0.35951441526412964, + "num_tokens": 2238970919.0, + "step": 4380 + }, + { + "epoch": 1.1846944294213089, + "grad_norm": 2.6875, + "learning_rate": 0.0180543507740786, + "loss": 3.5916, + "mean_token_accuracy": 0.35316014289855957, + "num_tokens": 2239495011.0, + "step": 4381 + }, + { + "epoch": 1.1849648458626285, + "grad_norm": 2.390625, + "learning_rate": 0.0180533716783782, + "loss": 3.4769, + "mean_token_accuracy": 0.36249157786369324, + "num_tokens": 2240019185.0, + "step": 4382 + }, + { + "epoch": 1.1852352623039482, + "grad_norm": 3.421875, + "learning_rate": 0.01805239236625961, + "loss": 3.2248, + "mean_token_accuracy": 0.3732716739177704, + "num_tokens": 2240543433.0, + "step": 4383 + }, + { + "epoch": 1.1855056787452678, + "grad_norm": 3.09375, + "learning_rate": 0.018051412837752884, + "loss": 3.4082, + "mean_token_accuracy": 0.38689637184143066, + "num_tokens": 2241067712.0, + "step": 4384 + }, + { + "epoch": 1.1857760951865872, + "grad_norm": 2.875, + "learning_rate": 0.018050433092888077, + "loss": 3.2539, + "mean_token_accuracy": 0.3712351620197296, + "num_tokens": 2241591857.0, + "step": 4385 + }, + { + "epoch": 1.1860465116279069, + "grad_norm": 5.375, + "learning_rate": 0.018049453131695245, + "loss": 3.2391, + "mean_token_accuracy": 0.432248592376709, + "num_tokens": 2242116052.0, + "step": 4386 + }, + { + "epoch": 1.1863169280692265, + "grad_norm": 2.484375, + "learning_rate": 0.01804847295420446, + "loss": 3.5354, + "mean_token_accuracy": 0.38690975308418274, + "num_tokens": 2242629777.0, + "step": 4387 + }, + { + "epoch": 1.1865873445105461, + "grad_norm": 3.6875, + "learning_rate": 0.018047492560445803, + "loss": 3.4383, + "mean_token_accuracy": 0.3694656491279602, + "num_tokens": 2243150803.0, + "step": 4388 + }, + { + "epoch": 1.1868577609518658, + "grad_norm": 2.5, + "learning_rate": 0.018046511950449342, + "loss": 3.4142, + "mean_token_accuracy": 0.3912012577056885, + "num_tokens": 2243675072.0, + "step": 4389 + }, + { + "epoch": 1.1871281773931854, + "grad_norm": 3.65625, + "learning_rate": 0.018045531124245177, + "loss": 3.3656, + "mean_token_accuracy": 0.3669663667678833, + "num_tokens": 2244199343.0, + "step": 4390 + }, + { + "epoch": 1.187398593834505, + "grad_norm": 25.625, + "learning_rate": 0.01804455008186339, + "loss": 16.0299, + "mean_token_accuracy": 0.010506518185138702, + "num_tokens": 2244669799.0, + "step": 4391 + }, + { + "epoch": 1.1876690102758247, + "grad_norm": 6.6875, + "learning_rate": 0.018043568823334095, + "loss": 3.877, + "mean_token_accuracy": 0.29111504554748535, + "num_tokens": 2245194029.0, + "step": 4392 + }, + { + "epoch": 1.1879394267171444, + "grad_norm": 2.8125, + "learning_rate": 0.018042587348687397, + "loss": 3.579, + "mean_token_accuracy": 0.347652792930603, + "num_tokens": 2245718278.0, + "step": 4393 + }, + { + "epoch": 1.188209843158464, + "grad_norm": 3.546875, + "learning_rate": 0.018041605657953406, + "loss": 3.6445, + "mean_token_accuracy": 0.3453386425971985, + "num_tokens": 2246242441.0, + "step": 4394 + }, + { + "epoch": 1.1884802595997837, + "grad_norm": 2.84375, + "learning_rate": 0.018040623751162246, + "loss": 3.1749, + "mean_token_accuracy": 0.3914402723312378, + "num_tokens": 2246683949.0, + "step": 4395 + }, + { + "epoch": 1.1887506760411033, + "grad_norm": 3.390625, + "learning_rate": 0.018039641628344042, + "loss": 3.2189, + "mean_token_accuracy": 0.38480281829833984, + "num_tokens": 2247208196.0, + "step": 4396 + }, + { + "epoch": 1.189021092482423, + "grad_norm": 3.015625, + "learning_rate": 0.01803865928952893, + "loss": 3.1427, + "mean_token_accuracy": 0.3840372562408447, + "num_tokens": 2247685550.0, + "step": 4397 + }, + { + "epoch": 1.1892915089237426, + "grad_norm": 3.421875, + "learning_rate": 0.018037676734747054, + "loss": 3.5695, + "mean_token_accuracy": 0.3666077256202698, + "num_tokens": 2248209818.0, + "step": 4398 + }, + { + "epoch": 1.1895619253650622, + "grad_norm": 2.484375, + "learning_rate": 0.01803669396402856, + "loss": 3.2536, + "mean_token_accuracy": 0.3725336194038391, + "num_tokens": 2248698412.0, + "step": 4399 + }, + { + "epoch": 1.1898323418063819, + "grad_norm": 2.3125, + "learning_rate": 0.0180357109774036, + "loss": 3.3633, + "mean_token_accuracy": 0.3802858293056488, + "num_tokens": 2249216626.0, + "step": 4400 + }, + { + "epoch": 1.1901027582477015, + "grad_norm": 2.40625, + "learning_rate": 0.018034727774902337, + "loss": 3.3751, + "mean_token_accuracy": 0.3686602711677551, + "num_tokens": 2249740902.0, + "step": 4401 + }, + { + "epoch": 1.1903731746890212, + "grad_norm": 2.4375, + "learning_rate": 0.01803374435655494, + "loss": 3.2651, + "mean_token_accuracy": 0.38237977027893066, + "num_tokens": 2250265079.0, + "step": 4402 + }, + { + "epoch": 1.1906435911303408, + "grad_norm": 2.59375, + "learning_rate": 0.018032760722391575, + "loss": 3.3015, + "mean_token_accuracy": 0.37200239300727844, + "num_tokens": 2250789249.0, + "step": 4403 + }, + { + "epoch": 1.1909140075716604, + "grad_norm": 2.921875, + "learning_rate": 0.018031776872442437, + "loss": 3.3081, + "mean_token_accuracy": 0.3846932351589203, + "num_tokens": 2251313409.0, + "step": 4404 + }, + { + "epoch": 1.19118442401298, + "grad_norm": 3.703125, + "learning_rate": 0.0180307928067377, + "loss": 3.3913, + "mean_token_accuracy": 0.37984126806259155, + "num_tokens": 2251830579.0, + "step": 4405 + }, + { + "epoch": 1.1914548404542997, + "grad_norm": 2.765625, + "learning_rate": 0.018029808525307566, + "loss": 3.2274, + "mean_token_accuracy": 0.3725535273551941, + "num_tokens": 2252354815.0, + "step": 4406 + }, + { + "epoch": 1.1917252568956194, + "grad_norm": 3.640625, + "learning_rate": 0.01802882402818223, + "loss": 3.2118, + "mean_token_accuracy": 0.38757145404815674, + "num_tokens": 2252868241.0, + "step": 4407 + }, + { + "epoch": 1.191995673336939, + "grad_norm": 2.703125, + "learning_rate": 0.018027839315391907, + "loss": 3.1676, + "mean_token_accuracy": 0.38314706087112427, + "num_tokens": 2253392415.0, + "step": 4408 + }, + { + "epoch": 1.1922660897782584, + "grad_norm": 3.59375, + "learning_rate": 0.018026854386966804, + "loss": 3.4876, + "mean_token_accuracy": 0.36790433526039124, + "num_tokens": 2253916692.0, + "step": 4409 + }, + { + "epoch": 1.192536506219578, + "grad_norm": 2.703125, + "learning_rate": 0.018025869242937146, + "loss": 3.1333, + "mean_token_accuracy": 0.3774029612541199, + "num_tokens": 2254440880.0, + "step": 4410 + }, + { + "epoch": 1.1928069226608977, + "grad_norm": 1.9296875, + "learning_rate": 0.018024883883333155, + "loss": 10.1083, + "mean_token_accuracy": 0.0001348068763036281, + "num_tokens": 2254933212.0, + "step": 4411 + }, + { + "epoch": 1.1930773391022174, + "grad_norm": 11.5625, + "learning_rate": 0.018023898308185068, + "loss": 4.1968, + "mean_token_accuracy": 0.27545973658561707, + "num_tokens": 2255452648.0, + "step": 4412 + }, + { + "epoch": 1.193347755543537, + "grad_norm": 2.921875, + "learning_rate": 0.018022912517523124, + "loss": 3.4947, + "mean_token_accuracy": 0.34599411487579346, + "num_tokens": 2255976772.0, + "step": 4413 + }, + { + "epoch": 1.1936181719848566, + "grad_norm": 2.15625, + "learning_rate": 0.018021926511377573, + "loss": 3.3169, + "mean_token_accuracy": 0.3744570016860962, + "num_tokens": 2256500910.0, + "step": 4414 + }, + { + "epoch": 1.1938885884261763, + "grad_norm": 16.875, + "learning_rate": 0.018020940289778666, + "loss": 3.3525, + "mean_token_accuracy": 0.3793732821941376, + "num_tokens": 2256954715.0, + "step": 4415 + }, + { + "epoch": 1.194159004867496, + "grad_norm": 2.796875, + "learning_rate": 0.018019953852756662, + "loss": 3.327, + "mean_token_accuracy": 0.3632029592990875, + "num_tokens": 2257478954.0, + "step": 4416 + }, + { + "epoch": 1.1944294213088156, + "grad_norm": 2.625, + "learning_rate": 0.01801896720034183, + "loss": 3.4115, + "mean_token_accuracy": 0.3678201138973236, + "num_tokens": 2258003146.0, + "step": 4417 + }, + { + "epoch": 1.1946998377501352, + "grad_norm": 3.25, + "learning_rate": 0.018017980332564444, + "loss": 3.4798, + "mean_token_accuracy": 0.3719342052936554, + "num_tokens": 2258527398.0, + "step": 4418 + }, + { + "epoch": 1.1949702541914549, + "grad_norm": 3.09375, + "learning_rate": 0.018016993249454785, + "loss": 3.1011, + "mean_token_accuracy": 0.37303170561790466, + "num_tokens": 2259051558.0, + "step": 4419 + }, + { + "epoch": 1.1952406706327745, + "grad_norm": 3.09375, + "learning_rate": 0.018016005951043135, + "loss": 3.4908, + "mean_token_accuracy": 0.3655598759651184, + "num_tokens": 2259575819.0, + "step": 4420 + }, + { + "epoch": 1.1955110870740941, + "grad_norm": 2.921875, + "learning_rate": 0.018015018437359788, + "loss": 3.4992, + "mean_token_accuracy": 0.36238211393356323, + "num_tokens": 2260060175.0, + "step": 4421 + }, + { + "epoch": 1.1957815035154138, + "grad_norm": 2.078125, + "learning_rate": 0.018014030708435047, + "loss": 3.1072, + "mean_token_accuracy": 0.39827489852905273, + "num_tokens": 2260573401.0, + "step": 4422 + }, + { + "epoch": 1.1960519199567334, + "grad_norm": 3.3125, + "learning_rate": 0.018013042764299218, + "loss": 3.0733, + "mean_token_accuracy": 0.3855803608894348, + "num_tokens": 2261036757.0, + "step": 4423 + }, + { + "epoch": 1.196322336398053, + "grad_norm": 3.125, + "learning_rate": 0.018012054604982612, + "loss": 3.6005, + "mean_token_accuracy": 0.3588961660861969, + "num_tokens": 2261560909.0, + "step": 4424 + }, + { + "epoch": 1.1965927528393727, + "grad_norm": 3.359375, + "learning_rate": 0.01801106623051555, + "loss": 3.624, + "mean_token_accuracy": 0.35758334398269653, + "num_tokens": 2262085041.0, + "step": 4425 + }, + { + "epoch": 1.1968631692806924, + "grad_norm": 2.78125, + "learning_rate": 0.01801007764092836, + "loss": 3.4992, + "mean_token_accuracy": 0.36550140380859375, + "num_tokens": 2262609325.0, + "step": 4426 + }, + { + "epoch": 1.1971335857220118, + "grad_norm": 3.0, + "learning_rate": 0.01800908883625137, + "loss": 3.5906, + "mean_token_accuracy": 0.38498347997665405, + "num_tokens": 2263071954.0, + "step": 4427 + }, + { + "epoch": 1.1974040021633314, + "grad_norm": 3.359375, + "learning_rate": 0.01800809981651492, + "loss": 3.3683, + "mean_token_accuracy": 0.38362783193588257, + "num_tokens": 2263596207.0, + "step": 4428 + }, + { + "epoch": 1.197674418604651, + "grad_norm": 3.984375, + "learning_rate": 0.01800711058174936, + "loss": 3.3334, + "mean_token_accuracy": 0.39271286129951477, + "num_tokens": 2264120446.0, + "step": 4429 + }, + { + "epoch": 1.1979448350459707, + "grad_norm": 4.09375, + "learning_rate": 0.018006121131985044, + "loss": 3.7308, + "mean_token_accuracy": 0.3144552707672119, + "num_tokens": 2264644596.0, + "step": 4430 + }, + { + "epoch": 1.1982152514872904, + "grad_norm": 36.25, + "learning_rate": 0.018005131467252324, + "loss": 13.1217, + "mean_token_accuracy": 0.037756532430648804, + "num_tokens": 2265144219.0, + "step": 4431 + }, + { + "epoch": 1.19848566792861, + "grad_norm": 6.3125, + "learning_rate": 0.01800414158758157, + "loss": 3.6808, + "mean_token_accuracy": 0.34846168756484985, + "num_tokens": 2265668348.0, + "step": 4432 + }, + { + "epoch": 1.1987560843699296, + "grad_norm": 2.53125, + "learning_rate": 0.018003151493003158, + "loss": 3.6502, + "mean_token_accuracy": 0.35231584310531616, + "num_tokens": 2266192442.0, + "step": 4433 + }, + { + "epoch": 1.1990265008112493, + "grad_norm": 3.234375, + "learning_rate": 0.01800216118354746, + "loss": 3.5627, + "mean_token_accuracy": 0.346628338098526, + "num_tokens": 2266707924.0, + "step": 4434 + }, + { + "epoch": 1.199296917252569, + "grad_norm": 8.625, + "learning_rate": 0.018001170659244864, + "loss": 3.5498, + "mean_token_accuracy": 0.36748939752578735, + "num_tokens": 2267203578.0, + "step": 4435 + }, + { + "epoch": 1.1995673336938886, + "grad_norm": 2.359375, + "learning_rate": 0.018000179920125765, + "loss": 3.5692, + "mean_token_accuracy": 0.344534307718277, + "num_tokens": 2267727715.0, + "step": 4436 + }, + { + "epoch": 1.1998377501352082, + "grad_norm": 2.140625, + "learning_rate": 0.01799918896622056, + "loss": 3.113, + "mean_token_accuracy": 0.42462557554244995, + "num_tokens": 2268251913.0, + "step": 4437 + }, + { + "epoch": 1.2001081665765279, + "grad_norm": 2.828125, + "learning_rate": 0.017998197797559654, + "loss": 3.4274, + "mean_token_accuracy": 0.37601184844970703, + "num_tokens": 2268732851.0, + "step": 4438 + }, + { + "epoch": 1.2003785830178475, + "grad_norm": 3.8125, + "learning_rate": 0.017997206414173458, + "loss": 2.8219, + "mean_token_accuracy": 0.4235619306564331, + "num_tokens": 2269257065.0, + "step": 4439 + }, + { + "epoch": 1.2006489994591671, + "grad_norm": 3.40625, + "learning_rate": 0.017996214816092395, + "loss": 3.4376, + "mean_token_accuracy": 0.37326619029045105, + "num_tokens": 2269781246.0, + "step": 4440 + }, + { + "epoch": 1.2009194159004868, + "grad_norm": 3.421875, + "learning_rate": 0.017995223003346883, + "loss": 3.3367, + "mean_token_accuracy": 0.370128870010376, + "num_tokens": 2270292198.0, + "step": 4441 + }, + { + "epoch": 1.2011898323418064, + "grad_norm": 2.78125, + "learning_rate": 0.01799423097596736, + "loss": 3.4407, + "mean_token_accuracy": 0.36713144183158875, + "num_tokens": 2270816297.0, + "step": 4442 + }, + { + "epoch": 1.201460248783126, + "grad_norm": 3.578125, + "learning_rate": 0.017993238733984264, + "loss": 3.5163, + "mean_token_accuracy": 0.3558102250099182, + "num_tokens": 2271340526.0, + "step": 4443 + }, + { + "epoch": 1.2017306652244457, + "grad_norm": 2.78125, + "learning_rate": 0.017992246277428035, + "loss": 3.2719, + "mean_token_accuracy": 0.3933027386665344, + "num_tokens": 2271864516.0, + "step": 4444 + }, + { + "epoch": 1.2020010816657654, + "grad_norm": 3.140625, + "learning_rate": 0.017991253606329125, + "loss": 3.4335, + "mean_token_accuracy": 0.37959355115890503, + "num_tokens": 2272347351.0, + "step": 4445 + }, + { + "epoch": 1.202271498107085, + "grad_norm": 2.90625, + "learning_rate": 0.017990260720717997, + "loss": 3.2835, + "mean_token_accuracy": 0.37112754583358765, + "num_tokens": 2272871388.0, + "step": 4446 + }, + { + "epoch": 1.2025419145484046, + "grad_norm": 4.1875, + "learning_rate": 0.017989267620625114, + "loss": 3.5528, + "mean_token_accuracy": 0.3460700213909149, + "num_tokens": 2273395651.0, + "step": 4447 + }, + { + "epoch": 1.2028123309897243, + "grad_norm": 3.515625, + "learning_rate": 0.017988274306080947, + "loss": 3.3859, + "mean_token_accuracy": 0.3712960481643677, + "num_tokens": 2273919813.0, + "step": 4448 + }, + { + "epoch": 1.203082747431044, + "grad_norm": 3.3125, + "learning_rate": 0.01798728077711597, + "loss": 3.2911, + "mean_token_accuracy": 0.3884727954864502, + "num_tokens": 2274415218.0, + "step": 4449 + }, + { + "epoch": 1.2033531638723634, + "grad_norm": 3.140625, + "learning_rate": 0.01798628703376067, + "loss": 3.4332, + "mean_token_accuracy": 0.3636930584907532, + "num_tokens": 2274939338.0, + "step": 4450 + }, + { + "epoch": 1.203623580313683, + "grad_norm": 16.125, + "learning_rate": 0.01798529307604554, + "loss": 15.0017, + "mean_token_accuracy": 0.0051892539486289024, + "num_tokens": 2275463510.0, + "step": 4451 + }, + { + "epoch": 1.2038939967550026, + "grad_norm": 9.5625, + "learning_rate": 0.017984298904001078, + "loss": 3.9004, + "mean_token_accuracy": 0.3119100332260132, + "num_tokens": 2275987778.0, + "step": 4452 + }, + { + "epoch": 1.2041644131963223, + "grad_norm": 2.1875, + "learning_rate": 0.017983304517657787, + "loss": 3.5241, + "mean_token_accuracy": 0.33945637941360474, + "num_tokens": 2276511930.0, + "step": 4453 + }, + { + "epoch": 1.204434829637642, + "grad_norm": 2.3125, + "learning_rate": 0.01798230991704617, + "loss": 3.2808, + "mean_token_accuracy": 0.38201218843460083, + "num_tokens": 2277036114.0, + "step": 4454 + }, + { + "epoch": 1.2047052460789616, + "grad_norm": 2.921875, + "learning_rate": 0.01798131510219676, + "loss": 3.4139, + "mean_token_accuracy": 0.3661939799785614, + "num_tokens": 2277560379.0, + "step": 4455 + }, + { + "epoch": 1.2049756625202812, + "grad_norm": 2.859375, + "learning_rate": 0.017980320073140064, + "loss": 3.3668, + "mean_token_accuracy": 0.3696788251399994, + "num_tokens": 2278084546.0, + "step": 4456 + }, + { + "epoch": 1.2052460789616009, + "grad_norm": 2.953125, + "learning_rate": 0.017979324829906627, + "loss": 3.4111, + "mean_token_accuracy": 0.39307254552841187, + "num_tokens": 2278454365.0, + "step": 4457 + }, + { + "epoch": 1.2055164954029205, + "grad_norm": 2.4375, + "learning_rate": 0.017978329372526972, + "loss": 3.1021, + "mean_token_accuracy": 0.39354008436203003, + "num_tokens": 2278953122.0, + "step": 4458 + }, + { + "epoch": 1.2057869118442401, + "grad_norm": 2.640625, + "learning_rate": 0.017977333701031654, + "loss": 3.4602, + "mean_token_accuracy": 0.3724978566169739, + "num_tokens": 2279477359.0, + "step": 4459 + }, + { + "epoch": 1.2060573282855598, + "grad_norm": 2.796875, + "learning_rate": 0.017976337815451222, + "loss": 3.3593, + "mean_token_accuracy": 0.3706081509590149, + "num_tokens": 2280001548.0, + "step": 4460 + }, + { + "epoch": 1.2063277447268794, + "grad_norm": 3.5625, + "learning_rate": 0.017975341715816228, + "loss": 3.2812, + "mean_token_accuracy": 0.38008370995521545, + "num_tokens": 2280502044.0, + "step": 4461 + }, + { + "epoch": 1.206598161168199, + "grad_norm": 2.421875, + "learning_rate": 0.017974345402157234, + "loss": 3.3958, + "mean_token_accuracy": 0.38434529304504395, + "num_tokens": 2281022806.0, + "step": 4462 + }, + { + "epoch": 1.2068685776095187, + "grad_norm": 2.859375, + "learning_rate": 0.017973348874504816, + "loss": 3.3271, + "mean_token_accuracy": 0.3732663691043854, + "num_tokens": 2281547085.0, + "step": 4463 + }, + { + "epoch": 1.2071389940508384, + "grad_norm": 2.21875, + "learning_rate": 0.017972352132889544, + "loss": 3.1893, + "mean_token_accuracy": 0.3945770859718323, + "num_tokens": 2282071181.0, + "step": 4464 + }, + { + "epoch": 1.207409410492158, + "grad_norm": 2.609375, + "learning_rate": 0.017971355177342007, + "loss": 3.4172, + "mean_token_accuracy": 0.3965485692024231, + "num_tokens": 2282560317.0, + "step": 4465 + }, + { + "epoch": 1.2076798269334776, + "grad_norm": 3.5, + "learning_rate": 0.01797035800789279, + "loss": 3.3506, + "mean_token_accuracy": 0.3709408640861511, + "num_tokens": 2283084559.0, + "step": 4466 + }, + { + "epoch": 1.2079502433747973, + "grad_norm": 2.5, + "learning_rate": 0.01796936062457249, + "loss": 3.2304, + "mean_token_accuracy": 0.3697029948234558, + "num_tokens": 2283608814.0, + "step": 4467 + }, + { + "epoch": 1.2082206598161167, + "grad_norm": 2.609375, + "learning_rate": 0.017968363027411716, + "loss": 3.3147, + "mean_token_accuracy": 0.37568262219429016, + "num_tokens": 2284133012.0, + "step": 4468 + }, + { + "epoch": 1.2084910762574363, + "grad_norm": 2.53125, + "learning_rate": 0.01796736521644107, + "loss": 3.4313, + "mean_token_accuracy": 0.36832159757614136, + "num_tokens": 2284657187.0, + "step": 4469 + }, + { + "epoch": 1.208761492698756, + "grad_norm": 11.875, + "learning_rate": 0.017966367191691167, + "loss": 2.9874, + "mean_token_accuracy": 0.4053063988685608, + "num_tokens": 2285181372.0, + "step": 4470 + }, + { + "epoch": 1.2090319091400756, + "grad_norm": 29.375, + "learning_rate": 0.017965368953192628, + "loss": 10.219, + "mean_token_accuracy": 0.0047205910086631775, + "num_tokens": 2285654205.0, + "step": 4471 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 5.21875, + "learning_rate": 0.017964370500976094, + "loss": 3.7355, + "mean_token_accuracy": 0.3230869770050049, + "num_tokens": 2286178474.0, + "step": 4472 + }, + { + "epoch": 1.209572742022715, + "grad_norm": 3.6875, + "learning_rate": 0.017963371835072187, + "loss": 3.6058, + "mean_token_accuracy": 0.32364094257354736, + "num_tokens": 2286699067.0, + "step": 4473 + }, + { + "epoch": 1.2098431584640346, + "grad_norm": 4.09375, + "learning_rate": 0.017962372955511556, + "loss": 3.6607, + "mean_token_accuracy": 0.37137824296951294, + "num_tokens": 2287223340.0, + "step": 4474 + }, + { + "epoch": 1.2101135749053542, + "grad_norm": 3.421875, + "learning_rate": 0.017961373862324846, + "loss": 3.4865, + "mean_token_accuracy": 0.3490687608718872, + "num_tokens": 2287747511.0, + "step": 4475 + }, + { + "epoch": 1.2103839913466738, + "grad_norm": 2.796875, + "learning_rate": 0.017960374555542717, + "loss": 3.2874, + "mean_token_accuracy": 0.3699013590812683, + "num_tokens": 2288263977.0, + "step": 4476 + }, + { + "epoch": 1.2106544077879935, + "grad_norm": 3.046875, + "learning_rate": 0.01795937503519582, + "loss": 3.3324, + "mean_token_accuracy": 0.3670088052749634, + "num_tokens": 2288788115.0, + "step": 4477 + }, + { + "epoch": 1.2109248242293131, + "grad_norm": 2.59375, + "learning_rate": 0.017958375301314836, + "loss": 3.2107, + "mean_token_accuracy": 0.398895263671875, + "num_tokens": 2289268837.0, + "step": 4478 + }, + { + "epoch": 1.2111952406706328, + "grad_norm": 3.34375, + "learning_rate": 0.017957375353930438, + "loss": 3.3562, + "mean_token_accuracy": 0.3825376629829407, + "num_tokens": 2289741188.0, + "step": 4479 + }, + { + "epoch": 1.2114656571119524, + "grad_norm": 2.9375, + "learning_rate": 0.0179563751930733, + "loss": 3.3012, + "mean_token_accuracy": 0.3864918351173401, + "num_tokens": 2290256582.0, + "step": 4480 + }, + { + "epoch": 1.211736073553272, + "grad_norm": 4.0, + "learning_rate": 0.017955374818774114, + "loss": 3.5366, + "mean_token_accuracy": 0.3528618812561035, + "num_tokens": 2290780732.0, + "step": 4481 + }, + { + "epoch": 1.2120064899945917, + "grad_norm": 2.828125, + "learning_rate": 0.017954374231063577, + "loss": 3.4143, + "mean_token_accuracy": 0.40440523624420166, + "num_tokens": 2291240188.0, + "step": 4482 + }, + { + "epoch": 1.2122769064359114, + "grad_norm": 2.765625, + "learning_rate": 0.017953373429972382, + "loss": 3.54, + "mean_token_accuracy": 0.37109375, + "num_tokens": 2291764469.0, + "step": 4483 + }, + { + "epoch": 1.212547322877231, + "grad_norm": 2.953125, + "learning_rate": 0.017952372415531243, + "loss": 3.509, + "mean_token_accuracy": 0.36217206716537476, + "num_tokens": 2292288735.0, + "step": 4484 + }, + { + "epoch": 1.2128177393185506, + "grad_norm": 2.6875, + "learning_rate": 0.017951371187770876, + "loss": 3.3043, + "mean_token_accuracy": 0.39321041107177734, + "num_tokens": 2292777915.0, + "step": 4485 + }, + { + "epoch": 1.2130881557598703, + "grad_norm": 3.0, + "learning_rate": 0.017950369746721994, + "loss": 3.4344, + "mean_token_accuracy": 0.3549060821533203, + "num_tokens": 2293302168.0, + "step": 4486 + }, + { + "epoch": 1.21335857220119, + "grad_norm": 3.984375, + "learning_rate": 0.01794936809241533, + "loss": 3.027, + "mean_token_accuracy": 0.38715165853500366, + "num_tokens": 2293822803.0, + "step": 4487 + }, + { + "epoch": 1.2136289886425096, + "grad_norm": 2.6875, + "learning_rate": 0.017948366224881617, + "loss": 3.4723, + "mean_token_accuracy": 0.3626425862312317, + "num_tokens": 2294304332.0, + "step": 4488 + }, + { + "epoch": 1.2138994050838292, + "grad_norm": 2.796875, + "learning_rate": 0.017947364144151594, + "loss": 3.26, + "mean_token_accuracy": 0.37638211250305176, + "num_tokens": 2294828473.0, + "step": 4489 + }, + { + "epoch": 1.2141698215251489, + "grad_norm": 2.984375, + "learning_rate": 0.01794636185025601, + "loss": 3.4565, + "mean_token_accuracy": 0.3681298792362213, + "num_tokens": 2295352750.0, + "step": 4490 + }, + { + "epoch": 1.2144402379664683, + "grad_norm": 72.0, + "learning_rate": 0.017945359343225612, + "loss": 18.634, + "mean_token_accuracy": 0.0426635667681694, + "num_tokens": 2295876954.0, + "step": 4491 + }, + { + "epoch": 1.214710654407788, + "grad_norm": 7.78125, + "learning_rate": 0.01794435662309117, + "loss": 3.9773, + "mean_token_accuracy": 0.3123549818992615, + "num_tokens": 2296401237.0, + "step": 4492 + }, + { + "epoch": 1.2149810708491076, + "grad_norm": 2.421875, + "learning_rate": 0.01794335368988344, + "loss": 3.4981, + "mean_token_accuracy": 0.36417117714881897, + "num_tokens": 2296911544.0, + "step": 4493 + }, + { + "epoch": 1.2152514872904272, + "grad_norm": 2.21875, + "learning_rate": 0.017942350543633204, + "loss": 3.126, + "mean_token_accuracy": 0.39321398735046387, + "num_tokens": 2297435633.0, + "step": 4494 + }, + { + "epoch": 1.2155219037317468, + "grad_norm": 3.484375, + "learning_rate": 0.017941347184371233, + "loss": 3.3703, + "mean_token_accuracy": 0.3757411539554596, + "num_tokens": 2297959785.0, + "step": 4495 + }, + { + "epoch": 1.2157923201730665, + "grad_norm": 3.078125, + "learning_rate": 0.017940343612128318, + "loss": 3.2834, + "mean_token_accuracy": 0.3737441599369049, + "num_tokens": 2298484052.0, + "step": 4496 + }, + { + "epoch": 1.2160627366143861, + "grad_norm": 4.0625, + "learning_rate": 0.01793933982693525, + "loss": 3.0616, + "mean_token_accuracy": 0.4032834470272064, + "num_tokens": 2298967190.0, + "step": 4497 + }, + { + "epoch": 1.2163331530557058, + "grad_norm": 2.796875, + "learning_rate": 0.017938335828822836, + "loss": 3.2802, + "mean_token_accuracy": 0.38291215896606445, + "num_tokens": 2299453635.0, + "step": 4498 + }, + { + "epoch": 1.2166035694970254, + "grad_norm": 3.515625, + "learning_rate": 0.017937331617821867, + "loss": 3.2893, + "mean_token_accuracy": 0.3900487720966339, + "num_tokens": 2299929863.0, + "step": 4499 + }, + { + "epoch": 1.216873985938345, + "grad_norm": 3.09375, + "learning_rate": 0.017936327193963163, + "loss": 3.2373, + "mean_token_accuracy": 0.4067736864089966, + "num_tokens": 2300454045.0, + "step": 4500 + }, + { + "epoch": 1.2171444023796647, + "grad_norm": 2.546875, + "learning_rate": 0.017935322557277544, + "loss": 3.4012, + "mean_token_accuracy": 0.40499308705329895, + "num_tokens": 2300978327.0, + "step": 4501 + }, + { + "epoch": 1.2174148188209843, + "grad_norm": 3.265625, + "learning_rate": 0.017934317707795834, + "loss": 3.3524, + "mean_token_accuracy": 0.37295663356781006, + "num_tokens": 2301450656.0, + "step": 4502 + }, + { + "epoch": 1.217685235262304, + "grad_norm": 3.0, + "learning_rate": 0.017933312645548868, + "loss": 3.4078, + "mean_token_accuracy": 0.3782126009464264, + "num_tokens": 2301949119.0, + "step": 4503 + }, + { + "epoch": 1.2179556517036236, + "grad_norm": 3.015625, + "learning_rate": 0.01793230737056747, + "loss": 3.3553, + "mean_token_accuracy": 0.38699567317962646, + "num_tokens": 2302473299.0, + "step": 4504 + }, + { + "epoch": 1.2182260681449433, + "grad_norm": 2.3125, + "learning_rate": 0.017931301882882503, + "loss": 3.0232, + "mean_token_accuracy": 0.41337111592292786, + "num_tokens": 2302997555.0, + "step": 4505 + }, + { + "epoch": 1.218496484586263, + "grad_norm": 2.59375, + "learning_rate": 0.01793029618252481, + "loss": 3.4347, + "mean_token_accuracy": 0.38701000809669495, + "num_tokens": 2303501909.0, + "step": 4506 + }, + { + "epoch": 1.2187669010275826, + "grad_norm": 3.765625, + "learning_rate": 0.01792929026952525, + "loss": 3.3681, + "mean_token_accuracy": 0.3783678710460663, + "num_tokens": 2304018217.0, + "step": 4507 + }, + { + "epoch": 1.2190373174689022, + "grad_norm": 2.734375, + "learning_rate": 0.017928284143914685, + "loss": 3.2696, + "mean_token_accuracy": 0.3987641930580139, + "num_tokens": 2304514193.0, + "step": 4508 + }, + { + "epoch": 1.2193077339102216, + "grad_norm": 3.640625, + "learning_rate": 0.017927277805723986, + "loss": 3.3797, + "mean_token_accuracy": 0.36549612879753113, + "num_tokens": 2305038398.0, + "step": 4509 + }, + { + "epoch": 1.2195781503515413, + "grad_norm": 3.375, + "learning_rate": 0.017926271254984038, + "loss": 3.4414, + "mean_token_accuracy": 0.43020522594451904, + "num_tokens": 2305498178.0, + "step": 4510 + }, + { + "epoch": 1.219848566792861, + "grad_norm": 136.0, + "learning_rate": 0.017925264491725712, + "loss": 18.8138, + "mean_token_accuracy": 0.0003311979817226529, + "num_tokens": 2306022374.0, + "step": 4511 + }, + { + "epoch": 1.2201189832341806, + "grad_norm": 9.5, + "learning_rate": 0.01792425751597991, + "loss": 4.2655, + "mean_token_accuracy": 0.3191072344779968, + "num_tokens": 2306546535.0, + "step": 4512 + }, + { + "epoch": 1.2203893996755002, + "grad_norm": 3.578125, + "learning_rate": 0.017923250327777522, + "loss": 3.6936, + "mean_token_accuracy": 0.34909647703170776, + "num_tokens": 2307070758.0, + "step": 4513 + }, + { + "epoch": 1.2206598161168198, + "grad_norm": 2.53125, + "learning_rate": 0.017922242927149456, + "loss": 3.4011, + "mean_token_accuracy": 0.3736551105976105, + "num_tokens": 2307594987.0, + "step": 4514 + }, + { + "epoch": 1.2209302325581395, + "grad_norm": 3.046875, + "learning_rate": 0.01792123531412662, + "loss": 3.4972, + "mean_token_accuracy": 0.3612072765827179, + "num_tokens": 2308119237.0, + "step": 4515 + }, + { + "epoch": 1.2212006489994591, + "grad_norm": 3.171875, + "learning_rate": 0.017920227488739927, + "loss": 3.3791, + "mean_token_accuracy": 0.3782726526260376, + "num_tokens": 2308619995.0, + "step": 4516 + }, + { + "epoch": 1.2214710654407788, + "grad_norm": 3.109375, + "learning_rate": 0.017919219451020308, + "loss": 3.5224, + "mean_token_accuracy": 0.3564542829990387, + "num_tokens": 2309144162.0, + "step": 4517 + }, + { + "epoch": 1.2217414818820984, + "grad_norm": 2.6875, + "learning_rate": 0.01791821120099869, + "loss": 3.5329, + "mean_token_accuracy": 0.3512321710586548, + "num_tokens": 2309668439.0, + "step": 4518 + }, + { + "epoch": 1.222011898323418, + "grad_norm": 2.78125, + "learning_rate": 0.017917202738706, + "loss": 3.1081, + "mean_token_accuracy": 0.39150407910346985, + "num_tokens": 2310130253.0, + "step": 4519 + }, + { + "epoch": 1.2222823147647377, + "grad_norm": 2.46875, + "learning_rate": 0.01791619406417319, + "loss": 3.2726, + "mean_token_accuracy": 0.403956800699234, + "num_tokens": 2310653588.0, + "step": 4520 + }, + { + "epoch": 1.2225527312060573, + "grad_norm": 2.859375, + "learning_rate": 0.01791518517743121, + "loss": 3.4299, + "mean_token_accuracy": 0.35074320435523987, + "num_tokens": 2311122296.0, + "step": 4521 + }, + { + "epoch": 1.222823147647377, + "grad_norm": 2.859375, + "learning_rate": 0.01791417607851101, + "loss": 3.3099, + "mean_token_accuracy": 0.3869004249572754, + "num_tokens": 2311646442.0, + "step": 4522 + }, + { + "epoch": 1.2230935640886966, + "grad_norm": 2.921875, + "learning_rate": 0.017913166767443558, + "loss": 3.3496, + "mean_token_accuracy": 0.3610694408416748, + "num_tokens": 2312170665.0, + "step": 4523 + }, + { + "epoch": 1.2233639805300163, + "grad_norm": 3.640625, + "learning_rate": 0.017912157244259815, + "loss": 2.875, + "mean_token_accuracy": 0.4475415349006653, + "num_tokens": 2312604325.0, + "step": 4524 + }, + { + "epoch": 1.223634396971336, + "grad_norm": 3.046875, + "learning_rate": 0.017911147508990765, + "loss": 3.0282, + "mean_token_accuracy": 0.4007852077484131, + "num_tokens": 2313128487.0, + "step": 4525 + }, + { + "epoch": 1.2239048134126556, + "grad_norm": 3.015625, + "learning_rate": 0.017910137561667382, + "loss": 3.3571, + "mean_token_accuracy": 0.36765456199645996, + "num_tokens": 2313652754.0, + "step": 4526 + }, + { + "epoch": 1.2241752298539752, + "grad_norm": 4.125, + "learning_rate": 0.017909127402320656, + "loss": 3.637, + "mean_token_accuracy": 0.36896535754203796, + "num_tokens": 2314109690.0, + "step": 4527 + }, + { + "epoch": 1.2244456462952948, + "grad_norm": 2.640625, + "learning_rate": 0.017908117030981587, + "loss": 3.3886, + "mean_token_accuracy": 0.3788718581199646, + "num_tokens": 2314633874.0, + "step": 4528 + }, + { + "epoch": 1.2247160627366145, + "grad_norm": 3.640625, + "learning_rate": 0.01790710644768117, + "loss": 3.413, + "mean_token_accuracy": 0.3819783926010132, + "num_tokens": 2315158142.0, + "step": 4529 + }, + { + "epoch": 1.2249864791779341, + "grad_norm": 2.359375, + "learning_rate": 0.01790609565245041, + "loss": 3.0848, + "mean_token_accuracy": 0.4039933979511261, + "num_tokens": 2315605683.0, + "step": 4530 + }, + { + "epoch": 1.2252568956192538, + "grad_norm": 3.484375, + "learning_rate": 0.01790508464532033, + "loss": 10.9229, + "mean_token_accuracy": 0.0004041655920445919, + "num_tokens": 2316129854.0, + "step": 4531 + }, + { + "epoch": 1.2255273120605732, + "grad_norm": 6.71875, + "learning_rate": 0.01790407342632195, + "loss": 3.9258, + "mean_token_accuracy": 0.3243821859359741, + "num_tokens": 2316654127.0, + "step": 4532 + }, + { + "epoch": 1.2257977285018928, + "grad_norm": 2.8125, + "learning_rate": 0.01790306199548629, + "loss": 3.5903, + "mean_token_accuracy": 0.3578133285045624, + "num_tokens": 2317178359.0, + "step": 4533 + }, + { + "epoch": 1.2260681449432125, + "grad_norm": 2.4375, + "learning_rate": 0.01790205035284439, + "loss": 3.3468, + "mean_token_accuracy": 0.3656672239303589, + "num_tokens": 2317702509.0, + "step": 4534 + }, + { + "epoch": 1.2263385613845321, + "grad_norm": 3.125, + "learning_rate": 0.01790103849842728, + "loss": 3.6236, + "mean_token_accuracy": 0.3585403561592102, + "num_tokens": 2318226604.0, + "step": 4535 + }, + { + "epoch": 1.2266089778258518, + "grad_norm": 3.21875, + "learning_rate": 0.01790002643226602, + "loss": 3.3139, + "mean_token_accuracy": 0.3974410891532898, + "num_tokens": 2318714604.0, + "step": 4536 + }, + { + "epoch": 1.2268793942671714, + "grad_norm": 4.71875, + "learning_rate": 0.017899014154391657, + "loss": 3.5023, + "mean_token_accuracy": 0.34414708614349365, + "num_tokens": 2319238874.0, + "step": 4537 + }, + { + "epoch": 1.227149810708491, + "grad_norm": 2.71875, + "learning_rate": 0.017898001664835253, + "loss": 3.4016, + "mean_token_accuracy": 0.3704392910003662, + "num_tokens": 2319763144.0, + "step": 4538 + }, + { + "epoch": 1.2274202271498107, + "grad_norm": 4.1875, + "learning_rate": 0.01789698896362787, + "loss": 3.4404, + "mean_token_accuracy": 0.3796938359737396, + "num_tokens": 2320250641.0, + "step": 4539 + }, + { + "epoch": 1.2276906435911303, + "grad_norm": 1.84375, + "learning_rate": 0.017895976050800583, + "loss": 3.504, + "mean_token_accuracy": 0.36896130442619324, + "num_tokens": 2320774832.0, + "step": 4540 + }, + { + "epoch": 1.22796106003245, + "grad_norm": 2.84375, + "learning_rate": 0.01789496292638447, + "loss": 3.3053, + "mean_token_accuracy": 0.38924548029899597, + "num_tokens": 2321219890.0, + "step": 4541 + }, + { + "epoch": 1.2282314764737696, + "grad_norm": 2.03125, + "learning_rate": 0.01789394959041062, + "loss": 3.4881, + "mean_token_accuracy": 0.3545508086681366, + "num_tokens": 2321732781.0, + "step": 4542 + }, + { + "epoch": 1.2285018929150893, + "grad_norm": 3.015625, + "learning_rate": 0.01789293604291012, + "loss": 3.2526, + "mean_token_accuracy": 0.3800765872001648, + "num_tokens": 2322257020.0, + "step": 4543 + }, + { + "epoch": 1.228772309356409, + "grad_norm": 2.828125, + "learning_rate": 0.017891922283914074, + "loss": 3.5101, + "mean_token_accuracy": 0.3819601535797119, + "num_tokens": 2322753259.0, + "step": 4544 + }, + { + "epoch": 1.2290427257977286, + "grad_norm": 3.546875, + "learning_rate": 0.017890908313453583, + "loss": 3.3816, + "mean_token_accuracy": 0.3518717288970947, + "num_tokens": 2323277518.0, + "step": 4545 + }, + { + "epoch": 1.2293131422390482, + "grad_norm": 2.4375, + "learning_rate": 0.01788989413155976, + "loss": 3.3851, + "mean_token_accuracy": 0.37668466567993164, + "num_tokens": 2323801803.0, + "step": 4546 + }, + { + "epoch": 1.2295835586803678, + "grad_norm": 2.890625, + "learning_rate": 0.017888879738263724, + "loss": 3.3329, + "mean_token_accuracy": 0.3847091495990753, + "num_tokens": 2324299337.0, + "step": 4547 + }, + { + "epoch": 1.2298539751216875, + "grad_norm": 3.03125, + "learning_rate": 0.0178878651335966, + "loss": 3.3663, + "mean_token_accuracy": 0.38749560713768005, + "num_tokens": 2324823517.0, + "step": 4548 + }, + { + "epoch": 1.2301243915630071, + "grad_norm": 3.15625, + "learning_rate": 0.017886850317589516, + "loss": 3.2572, + "mean_token_accuracy": 0.38336730003356934, + "num_tokens": 2325347734.0, + "step": 4549 + }, + { + "epoch": 1.2303948080043265, + "grad_norm": 2.5, + "learning_rate": 0.01788583529027361, + "loss": 3.1976, + "mean_token_accuracy": 0.3917982578277588, + "num_tokens": 2325863461.0, + "step": 4550 + }, + { + "epoch": 1.2306652244456462, + "grad_norm": 5.65625, + "learning_rate": 0.01788482005168003, + "loss": 13.6971, + "mean_token_accuracy": 0.009172007441520691, + "num_tokens": 2326387581.0, + "step": 4551 + }, + { + "epoch": 1.2309356408869658, + "grad_norm": 7.21875, + "learning_rate": 0.017883804601839925, + "loss": 3.7779, + "mean_token_accuracy": 0.30953946709632874, + "num_tokens": 2326911822.0, + "step": 4552 + }, + { + "epoch": 1.2312060573282855, + "grad_norm": 2.578125, + "learning_rate": 0.017882788940784447, + "loss": 3.4343, + "mean_token_accuracy": 0.3645632266998291, + "num_tokens": 2327435992.0, + "step": 4553 + }, + { + "epoch": 1.2314764737696051, + "grad_norm": 2.984375, + "learning_rate": 0.017881773068544767, + "loss": 3.6046, + "mean_token_accuracy": 0.3489891588687897, + "num_tokens": 2327960225.0, + "step": 4554 + }, + { + "epoch": 1.2317468902109248, + "grad_norm": 2.546875, + "learning_rate": 0.017880756985152048, + "loss": 3.4306, + "mean_token_accuracy": 0.3809955418109894, + "num_tokens": 2328484321.0, + "step": 4555 + }, + { + "epoch": 1.2320173066522444, + "grad_norm": 2.59375, + "learning_rate": 0.01787974069063747, + "loss": 3.4339, + "mean_token_accuracy": 0.36119693517684937, + "num_tokens": 2329008551.0, + "step": 4556 + }, + { + "epoch": 1.232287723093564, + "grad_norm": 2.5, + "learning_rate": 0.017878724185032217, + "loss": 3.3673, + "mean_token_accuracy": 0.36515194177627563, + "num_tokens": 2329532608.0, + "step": 4557 + }, + { + "epoch": 1.2325581395348837, + "grad_norm": 2.796875, + "learning_rate": 0.01787770746836748, + "loss": 3.3278, + "mean_token_accuracy": 0.35866081714630127, + "num_tokens": 2330056682.0, + "step": 4558 + }, + { + "epoch": 1.2328285559762033, + "grad_norm": 3.28125, + "learning_rate": 0.017876690540674447, + "loss": 3.4617, + "mean_token_accuracy": 0.37402427196502686, + "num_tokens": 2330548448.0, + "step": 4559 + }, + { + "epoch": 1.233098972417523, + "grad_norm": 2.96875, + "learning_rate": 0.017875673401984326, + "loss": 3.3106, + "mean_token_accuracy": 0.3645862936973572, + "num_tokens": 2331072615.0, + "step": 4560 + }, + { + "epoch": 1.2333693888588426, + "grad_norm": 3.890625, + "learning_rate": 0.017874656052328325, + "loss": 3.3863, + "mean_token_accuracy": 0.3808874487876892, + "num_tokens": 2331596827.0, + "step": 4561 + }, + { + "epoch": 1.2336398053001623, + "grad_norm": 4.3125, + "learning_rate": 0.01787363849173766, + "loss": 3.502, + "mean_token_accuracy": 0.3720747232437134, + "num_tokens": 2332110135.0, + "step": 4562 + }, + { + "epoch": 1.233910221741482, + "grad_norm": 3.796875, + "learning_rate": 0.017872620720243555, + "loss": 3.457, + "mean_token_accuracy": 0.38033515214920044, + "num_tokens": 2332634408.0, + "step": 4563 + }, + { + "epoch": 1.2341806381828015, + "grad_norm": 3.234375, + "learning_rate": 0.017871602737877232, + "loss": 3.5124, + "mean_token_accuracy": 0.36976301670074463, + "num_tokens": 2333158641.0, + "step": 4564 + }, + { + "epoch": 1.2344510546241212, + "grad_norm": 3.71875, + "learning_rate": 0.01787058454466993, + "loss": 3.065, + "mean_token_accuracy": 0.3866274952888489, + "num_tokens": 2333682902.0, + "step": 4565 + }, + { + "epoch": 1.2347214710654408, + "grad_norm": 3.28125, + "learning_rate": 0.017869566140652885, + "loss": 3.4999, + "mean_token_accuracy": 0.3675838112831116, + "num_tokens": 2334207168.0, + "step": 4566 + }, + { + "epoch": 1.2349918875067605, + "grad_norm": 3.65625, + "learning_rate": 0.017868547525857353, + "loss": 3.5268, + "mean_token_accuracy": 0.36472445726394653, + "num_tokens": 2334731252.0, + "step": 4567 + }, + { + "epoch": 1.2352623039480801, + "grad_norm": 3.03125, + "learning_rate": 0.017867528700314585, + "loss": 3.2967, + "mean_token_accuracy": 0.378271222114563, + "num_tokens": 2335220256.0, + "step": 4568 + }, + { + "epoch": 1.2355327203893998, + "grad_norm": 2.828125, + "learning_rate": 0.017866509664055837, + "loss": 3.3418, + "mean_token_accuracy": 0.3742244243621826, + "num_tokens": 2335744532.0, + "step": 4569 + }, + { + "epoch": 1.2358031368307194, + "grad_norm": 3.03125, + "learning_rate": 0.017865490417112377, + "loss": 3.2874, + "mean_token_accuracy": 0.38629767298698425, + "num_tokens": 2336209282.0, + "step": 4570 + }, + { + "epoch": 1.236073553272039, + "grad_norm": 4.0625, + "learning_rate": 0.017864470959515485, + "loss": 11.6382, + "mean_token_accuracy": 2.3606222384842113e-05, + "num_tokens": 2336733412.0, + "step": 4571 + }, + { + "epoch": 1.2363439697133587, + "grad_norm": 6.09375, + "learning_rate": 0.017863451291296437, + "loss": 3.7658, + "mean_token_accuracy": 0.3266458511352539, + "num_tokens": 2337210665.0, + "step": 4572 + }, + { + "epoch": 1.2366143861546781, + "grad_norm": 2.125, + "learning_rate": 0.017862431412486522, + "loss": 3.678, + "mean_token_accuracy": 0.3545815944671631, + "num_tokens": 2337734821.0, + "step": 4573 + }, + { + "epoch": 1.2368848025959978, + "grad_norm": 2.8125, + "learning_rate": 0.017861411323117023, + "loss": 3.4216, + "mean_token_accuracy": 0.3699098825454712, + "num_tokens": 2338258989.0, + "step": 4574 + }, + { + "epoch": 1.2371552190373174, + "grad_norm": 3.578125, + "learning_rate": 0.017860391023219253, + "loss": 3.4496, + "mean_token_accuracy": 0.3727841377258301, + "num_tokens": 2338783263.0, + "step": 4575 + }, + { + "epoch": 1.237425635478637, + "grad_norm": 3.328125, + "learning_rate": 0.017859370512824508, + "loss": 3.3429, + "mean_token_accuracy": 0.3731541633605957, + "num_tokens": 2339307402.0, + "step": 4576 + }, + { + "epoch": 1.2376960519199567, + "grad_norm": 2.65625, + "learning_rate": 0.017858349791964104, + "loss": 3.1162, + "mean_token_accuracy": 0.38509491086006165, + "num_tokens": 2339831610.0, + "step": 4577 + }, + { + "epoch": 1.2379664683612763, + "grad_norm": 2.6875, + "learning_rate": 0.01785732886066936, + "loss": 3.4008, + "mean_token_accuracy": 0.37529173493385315, + "num_tokens": 2340355779.0, + "step": 4578 + }, + { + "epoch": 1.238236884802596, + "grad_norm": 2.765625, + "learning_rate": 0.0178563077189716, + "loss": 3.2959, + "mean_token_accuracy": 0.37489837408065796, + "num_tokens": 2340880045.0, + "step": 4579 + }, + { + "epoch": 1.2385073012439156, + "grad_norm": 3.015625, + "learning_rate": 0.017855286366902156, + "loss": 3.3527, + "mean_token_accuracy": 0.35805630683898926, + "num_tokens": 2341404176.0, + "step": 4580 + }, + { + "epoch": 1.2387777176852353, + "grad_norm": 2.59375, + "learning_rate": 0.017854264804492367, + "loss": 3.2637, + "mean_token_accuracy": 0.37189459800720215, + "num_tokens": 2341928278.0, + "step": 4581 + }, + { + "epoch": 1.239048134126555, + "grad_norm": 3.6875, + "learning_rate": 0.017853243031773575, + "loss": 3.4757, + "mean_token_accuracy": 0.3779982030391693, + "num_tokens": 2342452347.0, + "step": 4582 + }, + { + "epoch": 1.2393185505678745, + "grad_norm": 3.125, + "learning_rate": 0.017852221048777137, + "loss": 3.1255, + "mean_token_accuracy": 0.38604220747947693, + "num_tokens": 2342976584.0, + "step": 4583 + }, + { + "epoch": 1.2395889670091942, + "grad_norm": 2.765625, + "learning_rate": 0.0178511988555344, + "loss": 3.2827, + "mean_token_accuracy": 0.39283287525177, + "num_tokens": 2343500843.0, + "step": 4584 + }, + { + "epoch": 1.2398593834505138, + "grad_norm": 3.34375, + "learning_rate": 0.01785017645207674, + "loss": 3.4546, + "mean_token_accuracy": 0.36667245626449585, + "num_tokens": 2344025104.0, + "step": 4585 + }, + { + "epoch": 1.2401297998918335, + "grad_norm": 2.390625, + "learning_rate": 0.01784915383843552, + "loss": 3.1536, + "mean_token_accuracy": 0.38854730129241943, + "num_tokens": 2344538074.0, + "step": 4586 + }, + { + "epoch": 1.2404002163331531, + "grad_norm": 2.515625, + "learning_rate": 0.017848131014642114, + "loss": 3.5739, + "mean_token_accuracy": 0.35783421993255615, + "num_tokens": 2345062238.0, + "step": 4587 + }, + { + "epoch": 1.2406706327744728, + "grad_norm": 2.328125, + "learning_rate": 0.01784710798072791, + "loss": 3.2515, + "mean_token_accuracy": 0.38535863161087036, + "num_tokens": 2345586396.0, + "step": 4588 + }, + { + "epoch": 1.2409410492157924, + "grad_norm": 2.171875, + "learning_rate": 0.0178460847367243, + "loss": 3.1352, + "mean_token_accuracy": 0.38846027851104736, + "num_tokens": 2346066163.0, + "step": 4589 + }, + { + "epoch": 1.241211465657112, + "grad_norm": 2.828125, + "learning_rate": 0.017845061282662677, + "loss": 3.0543, + "mean_token_accuracy": 0.38432973623275757, + "num_tokens": 2346548564.0, + "step": 4590 + }, + { + "epoch": 1.2414818820984315, + "grad_norm": 2.3125, + "learning_rate": 0.017844037618574442, + "loss": 11.0456, + "mean_token_accuracy": 1.9156435882905498e-05, + "num_tokens": 2347072573.0, + "step": 4591 + }, + { + "epoch": 1.241752298539751, + "grad_norm": 6.03125, + "learning_rate": 0.01784301374449101, + "loss": 3.6448, + "mean_token_accuracy": 0.31856393814086914, + "num_tokens": 2347596854.0, + "step": 4592 + }, + { + "epoch": 1.2420227149810708, + "grad_norm": 2.625, + "learning_rate": 0.017841989660443786, + "loss": 3.2837, + "mean_token_accuracy": 0.391152948141098, + "num_tokens": 2348121123.0, + "step": 4593 + }, + { + "epoch": 1.2422931314223904, + "grad_norm": 3.671875, + "learning_rate": 0.017840965366464203, + "loss": 3.3148, + "mean_token_accuracy": 0.35442423820495605, + "num_tokens": 2348645315.0, + "step": 4594 + }, + { + "epoch": 1.24256354786371, + "grad_norm": 2.578125, + "learning_rate": 0.017839940862583684, + "loss": 3.3031, + "mean_token_accuracy": 0.3899274170398712, + "num_tokens": 2349169579.0, + "step": 4595 + }, + { + "epoch": 1.2428339643050297, + "grad_norm": 3.40625, + "learning_rate": 0.01783891614883366, + "loss": 3.2638, + "mean_token_accuracy": 0.36304518580436707, + "num_tokens": 2349693685.0, + "step": 4596 + }, + { + "epoch": 1.2431043807463493, + "grad_norm": 2.625, + "learning_rate": 0.01783789122524558, + "loss": 3.4197, + "mean_token_accuracy": 0.3610089123249054, + "num_tokens": 2350217962.0, + "step": 4597 + }, + { + "epoch": 1.243374797187669, + "grad_norm": 3.484375, + "learning_rate": 0.01783686609185089, + "loss": 3.4838, + "mean_token_accuracy": 0.3600338399410248, + "num_tokens": 2350742225.0, + "step": 4598 + }, + { + "epoch": 1.2436452136289886, + "grad_norm": 2.6875, + "learning_rate": 0.01783584074868104, + "loss": 3.2071, + "mean_token_accuracy": 0.372917503118515, + "num_tokens": 2351266352.0, + "step": 4599 + }, + { + "epoch": 1.2439156300703083, + "grad_norm": 2.671875, + "learning_rate": 0.017834815195767496, + "loss": 3.3373, + "mean_token_accuracy": 0.39372557401657104, + "num_tokens": 2351773670.0, + "step": 4600 + }, + { + "epoch": 1.244186046511628, + "grad_norm": 2.828125, + "learning_rate": 0.01783378943314172, + "loss": 3.2497, + "mean_token_accuracy": 0.3592039942741394, + "num_tokens": 2352297859.0, + "step": 4601 + }, + { + "epoch": 1.2444564629529475, + "grad_norm": 2.921875, + "learning_rate": 0.017832763460835187, + "loss": 3.4925, + "mean_token_accuracy": 0.387103408575058, + "num_tokens": 2352772439.0, + "step": 4602 + }, + { + "epoch": 1.2447268793942672, + "grad_norm": 3.546875, + "learning_rate": 0.017831737278879375, + "loss": 3.1177, + "mean_token_accuracy": 0.38688623905181885, + "num_tokens": 2353296537.0, + "step": 4603 + }, + { + "epoch": 1.2449972958355868, + "grad_norm": 4.03125, + "learning_rate": 0.017830710887305774, + "loss": 3.3209, + "mean_token_accuracy": 0.39318791031837463, + "num_tokens": 2353809418.0, + "step": 4604 + }, + { + "epoch": 1.2452677122769065, + "grad_norm": 4.40625, + "learning_rate": 0.017829684286145876, + "loss": 3.1755, + "mean_token_accuracy": 0.3656543493270874, + "num_tokens": 2354333632.0, + "step": 4605 + }, + { + "epoch": 1.2455381287182261, + "grad_norm": 2.9375, + "learning_rate": 0.017828657475431182, + "loss": 3.3744, + "mean_token_accuracy": 0.3904167413711548, + "num_tokens": 2354817037.0, + "step": 4606 + }, + { + "epoch": 1.2458085451595458, + "grad_norm": 3.171875, + "learning_rate": 0.017827630455193193, + "loss": 3.3917, + "mean_token_accuracy": 0.3722078204154968, + "num_tokens": 2355341315.0, + "step": 4607 + }, + { + "epoch": 1.2460789616008654, + "grad_norm": 2.40625, + "learning_rate": 0.01782660322546342, + "loss": 3.1609, + "mean_token_accuracy": 0.3960948586463928, + "num_tokens": 2355865516.0, + "step": 4608 + }, + { + "epoch": 1.246349378042185, + "grad_norm": 2.828125, + "learning_rate": 0.017825575786273386, + "loss": 3.341, + "mean_token_accuracy": 0.3781088590621948, + "num_tokens": 2356389713.0, + "step": 4609 + }, + { + "epoch": 1.2466197944835047, + "grad_norm": 2.515625, + "learning_rate": 0.017824548137654612, + "loss": 3.1121, + "mean_token_accuracy": 0.39060288667678833, + "num_tokens": 2356913960.0, + "step": 4610 + }, + { + "epoch": 1.2468902109248243, + "grad_norm": 103.5, + "learning_rate": 0.017823520279638634, + "loss": 12.2701, + "mean_token_accuracy": 0.0026921106036752462, + "num_tokens": 2357438243.0, + "step": 4611 + }, + { + "epoch": 1.247160627366144, + "grad_norm": 7.71875, + "learning_rate": 0.01782249221225699, + "loss": 3.6419, + "mean_token_accuracy": 0.3248879611492157, + "num_tokens": 2357962458.0, + "step": 4612 + }, + { + "epoch": 1.2474310438074636, + "grad_norm": 2.546875, + "learning_rate": 0.017821463935541217, + "loss": 3.1864, + "mean_token_accuracy": 0.35301291942596436, + "num_tokens": 2358486474.0, + "step": 4613 + }, + { + "epoch": 1.247701460248783, + "grad_norm": 3.09375, + "learning_rate": 0.017820435449522873, + "loss": 3.6328, + "mean_token_accuracy": 0.35021066665649414, + "num_tokens": 2359010716.0, + "step": 4614 + }, + { + "epoch": 1.2479718766901027, + "grad_norm": 3.71875, + "learning_rate": 0.01781940675423351, + "loss": 3.5749, + "mean_token_accuracy": 0.3608403503894806, + "num_tokens": 2359534982.0, + "step": 4615 + }, + { + "epoch": 1.2482422931314223, + "grad_norm": 2.671875, + "learning_rate": 0.01781837784970469, + "loss": 3.3081, + "mean_token_accuracy": 0.34426960349082947, + "num_tokens": 2360059188.0, + "step": 4616 + }, + { + "epoch": 1.248512709572742, + "grad_norm": 3.078125, + "learning_rate": 0.01781734873596799, + "loss": 3.5144, + "mean_token_accuracy": 0.3605707883834839, + "num_tokens": 2360583400.0, + "step": 4617 + }, + { + "epoch": 1.2487831260140616, + "grad_norm": 7.21875, + "learning_rate": 0.017816319413054983, + "loss": 3.3469, + "mean_token_accuracy": 0.3813353478908539, + "num_tokens": 2361107599.0, + "step": 4618 + }, + { + "epoch": 1.2490535424553812, + "grad_norm": 1.765625, + "learning_rate": 0.01781528988099725, + "loss": 3.5662, + "mean_token_accuracy": 0.3434162735939026, + "num_tokens": 2361631872.0, + "step": 4619 + }, + { + "epoch": 1.249323958896701, + "grad_norm": 3.0, + "learning_rate": 0.017814260139826378, + "loss": 3.3667, + "mean_token_accuracy": 0.3685417175292969, + "num_tokens": 2362155962.0, + "step": 4620 + }, + { + "epoch": 1.2495943753380205, + "grad_norm": 3.8125, + "learning_rate": 0.017813230189573967, + "loss": 3.3273, + "mean_token_accuracy": 0.3598954379558563, + "num_tokens": 2362680133.0, + "step": 4621 + }, + { + "epoch": 1.2498647917793402, + "grad_norm": 2.765625, + "learning_rate": 0.01781220003027162, + "loss": 3.4045, + "mean_token_accuracy": 0.38796865940093994, + "num_tokens": 2363204236.0, + "step": 4622 + }, + { + "epoch": 1.2501352082206598, + "grad_norm": 3.609375, + "learning_rate": 0.01781116966195094, + "loss": 3.5722, + "mean_token_accuracy": 0.3417690396308899, + "num_tokens": 2363728517.0, + "step": 4623 + }, + { + "epoch": 1.2504056246619795, + "grad_norm": 2.828125, + "learning_rate": 0.017810139084643545, + "loss": 3.3537, + "mean_token_accuracy": 0.3969626724720001, + "num_tokens": 2364250734.0, + "step": 4624 + }, + { + "epoch": 1.250676041103299, + "grad_norm": 3.140625, + "learning_rate": 0.017809108298381057, + "loss": 3.2623, + "mean_token_accuracy": 0.3686820864677429, + "num_tokens": 2364742272.0, + "step": 4625 + }, + { + "epoch": 1.2509464575446188, + "grad_norm": 2.890625, + "learning_rate": 0.0178080773031951, + "loss": 3.304, + "mean_token_accuracy": 0.4233378767967224, + "num_tokens": 2365201221.0, + "step": 4626 + }, + { + "epoch": 1.2512168739859384, + "grad_norm": 3.546875, + "learning_rate": 0.01780704609911731, + "loss": 3.2777, + "mean_token_accuracy": 0.3775704503059387, + "num_tokens": 2365725282.0, + "step": 4627 + }, + { + "epoch": 1.251487290427258, + "grad_norm": 2.75, + "learning_rate": 0.01780601468617933, + "loss": 3.4633, + "mean_token_accuracy": 0.3761501610279083, + "num_tokens": 2366228614.0, + "step": 4628 + }, + { + "epoch": 1.2517577068685777, + "grad_norm": 4.15625, + "learning_rate": 0.0178049830644128, + "loss": 3.3116, + "mean_token_accuracy": 0.3697386682033539, + "num_tokens": 2366752811.0, + "step": 4629 + }, + { + "epoch": 1.2520281233098973, + "grad_norm": 2.90625, + "learning_rate": 0.01780395123384938, + "loss": 3.0976, + "mean_token_accuracy": 0.39320942759513855, + "num_tokens": 2367276914.0, + "step": 4630 + }, + { + "epoch": 1.2522985397512167, + "grad_norm": 13.375, + "learning_rate": 0.01780291919452073, + "loss": 10.1736, + "mean_token_accuracy": 0.0002748964470811188, + "num_tokens": 2367755954.0, + "step": 4631 + }, + { + "epoch": 1.2525689561925364, + "grad_norm": 5.75, + "learning_rate": 0.017801886946458508, + "loss": 3.8783, + "mean_token_accuracy": 0.3280773162841797, + "num_tokens": 2368280164.0, + "step": 4632 + }, + { + "epoch": 1.252839372633856, + "grad_norm": 2.671875, + "learning_rate": 0.017800854489694398, + "loss": 3.1346, + "mean_token_accuracy": 0.39749473333358765, + "num_tokens": 2368804411.0, + "step": 4633 + }, + { + "epoch": 1.2531097890751757, + "grad_norm": 2.75, + "learning_rate": 0.017799821824260067, + "loss": 3.4224, + "mean_token_accuracy": 0.36580443382263184, + "num_tokens": 2369328590.0, + "step": 4634 + }, + { + "epoch": 1.2533802055164953, + "grad_norm": 2.65625, + "learning_rate": 0.017798788950187203, + "loss": 3.3864, + "mean_token_accuracy": 0.3815577030181885, + "num_tokens": 2369852750.0, + "step": 4635 + }, + { + "epoch": 1.253650621957815, + "grad_norm": 2.890625, + "learning_rate": 0.017797755867507505, + "loss": 3.5494, + "mean_token_accuracy": 0.3523223102092743, + "num_tokens": 2370376925.0, + "step": 4636 + }, + { + "epoch": 1.2539210383991346, + "grad_norm": 2.921875, + "learning_rate": 0.017796722576252668, + "loss": 3.0237, + "mean_token_accuracy": 0.3815240263938904, + "num_tokens": 2370901111.0, + "step": 4637 + }, + { + "epoch": 1.2541914548404542, + "grad_norm": 2.953125, + "learning_rate": 0.017795689076454388, + "loss": 3.3492, + "mean_token_accuracy": 0.3810010552406311, + "num_tokens": 2371365562.0, + "step": 4638 + }, + { + "epoch": 1.2544618712817739, + "grad_norm": 3.390625, + "learning_rate": 0.017794655368144386, + "loss": 3.3933, + "mean_token_accuracy": 0.37548303604125977, + "num_tokens": 2371889816.0, + "step": 4639 + }, + { + "epoch": 1.2547322877230935, + "grad_norm": 12.0625, + "learning_rate": 0.017793621451354377, + "loss": 3.251, + "mean_token_accuracy": 0.35539913177490234, + "num_tokens": 2372413945.0, + "step": 4640 + }, + { + "epoch": 1.2550027041644132, + "grad_norm": 2.78125, + "learning_rate": 0.017792587326116077, + "loss": 3.5042, + "mean_token_accuracy": 0.3559306859970093, + "num_tokens": 2372830322.0, + "step": 4641 + }, + { + "epoch": 1.2552731206057328, + "grad_norm": 2.546875, + "learning_rate": 0.01779155299246123, + "loss": 3.2643, + "mean_token_accuracy": 0.345203697681427, + "num_tokens": 2373354482.0, + "step": 4642 + }, + { + "epoch": 1.2555435370470525, + "grad_norm": 3.453125, + "learning_rate": 0.01779051845042156, + "loss": 3.2138, + "mean_token_accuracy": 0.3855210840702057, + "num_tokens": 2373878664.0, + "step": 4643 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 2.875, + "learning_rate": 0.017789483700028814, + "loss": 3.2033, + "mean_token_accuracy": 0.3996700644493103, + "num_tokens": 2374402932.0, + "step": 4644 + }, + { + "epoch": 1.2560843699296917, + "grad_norm": 3.109375, + "learning_rate": 0.01778844874131474, + "loss": 3.35, + "mean_token_accuracy": 0.3793284595012665, + "num_tokens": 2374927190.0, + "step": 4645 + }, + { + "epoch": 1.2563547863710114, + "grad_norm": 4.03125, + "learning_rate": 0.0177874135743111, + "loss": 3.4053, + "mean_token_accuracy": 0.3713715076446533, + "num_tokens": 2375451289.0, + "step": 4646 + }, + { + "epoch": 1.256625202812331, + "grad_norm": 3.140625, + "learning_rate": 0.01778637819904965, + "loss": 3.3304, + "mean_token_accuracy": 0.39068761467933655, + "num_tokens": 2375959534.0, + "step": 4647 + }, + { + "epoch": 1.2568956192536507, + "grad_norm": 3.265625, + "learning_rate": 0.017785342615562156, + "loss": 3.2342, + "mean_token_accuracy": 0.40160176157951355, + "num_tokens": 2376483784.0, + "step": 4648 + }, + { + "epoch": 1.2571660356949703, + "grad_norm": 2.59375, + "learning_rate": 0.017784306823880398, + "loss": 3.3331, + "mean_token_accuracy": 0.37914496660232544, + "num_tokens": 2377007970.0, + "step": 4649 + }, + { + "epoch": 1.25743645213629, + "grad_norm": 3.546875, + "learning_rate": 0.01778327082403616, + "loss": 3.3263, + "mean_token_accuracy": 0.3726055324077606, + "num_tokens": 2377507268.0, + "step": 4650 + }, + { + "epoch": 1.2577068685776096, + "grad_norm": 52.25, + "learning_rate": 0.017782234616061217, + "loss": 14.7297, + "mean_token_accuracy": 4.5237673475639895e-05, + "num_tokens": 2377967850.0, + "step": 4651 + }, + { + "epoch": 1.2579772850189292, + "grad_norm": 5.8125, + "learning_rate": 0.017781198199987375, + "loss": 3.5698, + "mean_token_accuracy": 0.34940847754478455, + "num_tokens": 2378491990.0, + "step": 4652 + }, + { + "epoch": 1.258247701460249, + "grad_norm": 2.78125, + "learning_rate": 0.01778016157584643, + "loss": 3.4042, + "mean_token_accuracy": 0.37995657324790955, + "num_tokens": 2379008198.0, + "step": 4653 + }, + { + "epoch": 1.2585181179015685, + "grad_norm": 3.25, + "learning_rate": 0.017779124743670185, + "loss": 3.401, + "mean_token_accuracy": 0.36969631910324097, + "num_tokens": 2379497566.0, + "step": 4654 + }, + { + "epoch": 1.2587885343428882, + "grad_norm": 2.65625, + "learning_rate": 0.01777808770349046, + "loss": 3.5003, + "mean_token_accuracy": 0.37747806310653687, + "num_tokens": 2380021838.0, + "step": 4655 + }, + { + "epoch": 1.2590589507842076, + "grad_norm": 2.71875, + "learning_rate": 0.01777705045533907, + "loss": 3.3017, + "mean_token_accuracy": 0.3808504343032837, + "num_tokens": 2380542193.0, + "step": 4656 + }, + { + "epoch": 1.2593293672255272, + "grad_norm": 3.25, + "learning_rate": 0.017776012999247845, + "loss": 3.3046, + "mean_token_accuracy": 0.411271870136261, + "num_tokens": 2381066244.0, + "step": 4657 + }, + { + "epoch": 1.2595997836668469, + "grad_norm": 3.59375, + "learning_rate": 0.017774975335248612, + "loss": 3.5289, + "mean_token_accuracy": 0.3698303699493408, + "num_tokens": 2381590513.0, + "step": 4658 + }, + { + "epoch": 1.2598702001081665, + "grad_norm": 2.4375, + "learning_rate": 0.01777393746337321, + "loss": 3.1783, + "mean_token_accuracy": 0.36964643001556396, + "num_tokens": 2382114782.0, + "step": 4659 + }, + { + "epoch": 1.2601406165494862, + "grad_norm": 3.671875, + "learning_rate": 0.01777289938365349, + "loss": 3.0602, + "mean_token_accuracy": 0.3707866668701172, + "num_tokens": 2382639002.0, + "step": 4660 + }, + { + "epoch": 1.2604110329908058, + "grad_norm": 2.546875, + "learning_rate": 0.017771861096121295, + "loss": 3.1819, + "mean_token_accuracy": 0.3776363432407379, + "num_tokens": 2383158903.0, + "step": 4661 + }, + { + "epoch": 1.2606814494321255, + "grad_norm": 3.609375, + "learning_rate": 0.01777082260080849, + "loss": 3.4527, + "mean_token_accuracy": 0.36646974086761475, + "num_tokens": 2383683042.0, + "step": 4662 + }, + { + "epoch": 1.260951865873445, + "grad_norm": 4.03125, + "learning_rate": 0.017769783897746933, + "loss": 3.6503, + "mean_token_accuracy": 0.35280942916870117, + "num_tokens": 2384207221.0, + "step": 4663 + }, + { + "epoch": 1.2612222823147647, + "grad_norm": 2.765625, + "learning_rate": 0.0177687449869685, + "loss": 3.4482, + "mean_token_accuracy": 0.3672533631324768, + "num_tokens": 2384731303.0, + "step": 4664 + }, + { + "epoch": 1.2614926987560844, + "grad_norm": 3.234375, + "learning_rate": 0.01776770586850506, + "loss": 3.3504, + "mean_token_accuracy": 0.3660133481025696, + "num_tokens": 2385255471.0, + "step": 4665 + }, + { + "epoch": 1.261763115197404, + "grad_norm": 2.03125, + "learning_rate": 0.017766666542388506, + "loss": 2.9306, + "mean_token_accuracy": 0.41768208146095276, + "num_tokens": 2385779678.0, + "step": 4666 + }, + { + "epoch": 1.2620335316387237, + "grad_norm": 3.46875, + "learning_rate": 0.017765627008650725, + "loss": 3.466, + "mean_token_accuracy": 0.3711543381214142, + "num_tokens": 2386299861.0, + "step": 4667 + }, + { + "epoch": 1.2623039480800433, + "grad_norm": 3.640625, + "learning_rate": 0.01776458726732361, + "loss": 3.6154, + "mean_token_accuracy": 0.3668053150177002, + "num_tokens": 2386824132.0, + "step": 4668 + }, + { + "epoch": 1.262574364521363, + "grad_norm": 3.71875, + "learning_rate": 0.01776354731843906, + "loss": 3.4538, + "mean_token_accuracy": 0.35826942324638367, + "num_tokens": 2387348311.0, + "step": 4669 + }, + { + "epoch": 1.2628447809626826, + "grad_norm": 3.03125, + "learning_rate": 0.01776250716202899, + "loss": 3.358, + "mean_token_accuracy": 0.38218873739242554, + "num_tokens": 2387872453.0, + "step": 4670 + }, + { + "epoch": 1.2631151974040022, + "grad_norm": 14.5625, + "learning_rate": 0.017761466798125316, + "loss": 11.8369, + "mean_token_accuracy": 0.010218216106295586, + "num_tokens": 2388348159.0, + "step": 4671 + }, + { + "epoch": 1.2633856138453217, + "grad_norm": 10.25, + "learning_rate": 0.017760426226759954, + "loss": 3.9281, + "mean_token_accuracy": 0.29184895753860474, + "num_tokens": 2388872412.0, + "step": 4672 + }, + { + "epoch": 1.2636560302866413, + "grad_norm": 2.71875, + "learning_rate": 0.017759385447964834, + "loss": 3.3857, + "mean_token_accuracy": 0.3381437063217163, + "num_tokens": 2389396611.0, + "step": 4673 + }, + { + "epoch": 1.263926446727961, + "grad_norm": 2.28125, + "learning_rate": 0.01775834446177189, + "loss": 3.2108, + "mean_token_accuracy": 0.375221312046051, + "num_tokens": 2389920890.0, + "step": 4674 + }, + { + "epoch": 1.2641968631692806, + "grad_norm": 2.71875, + "learning_rate": 0.017757303268213065, + "loss": 3.1708, + "mean_token_accuracy": 0.36350181698799133, + "num_tokens": 2390445127.0, + "step": 4675 + }, + { + "epoch": 1.2644672796106002, + "grad_norm": 2.75, + "learning_rate": 0.017756261867320305, + "loss": 3.4678, + "mean_token_accuracy": 0.38050830364227295, + "num_tokens": 2390969334.0, + "step": 4676 + }, + { + "epoch": 1.2647376960519199, + "grad_norm": 3.6875, + "learning_rate": 0.017755220259125556, + "loss": 3.4748, + "mean_token_accuracy": 0.37829160690307617, + "num_tokens": 2391463478.0, + "step": 4677 + }, + { + "epoch": 1.2650081124932395, + "grad_norm": 2.65625, + "learning_rate": 0.01775417844366079, + "loss": 3.3868, + "mean_token_accuracy": 0.37785613536834717, + "num_tokens": 2391987659.0, + "step": 4678 + }, + { + "epoch": 1.2652785289345592, + "grad_norm": 3.03125, + "learning_rate": 0.017753136420957962, + "loss": 3.4175, + "mean_token_accuracy": 0.3563380539417267, + "num_tokens": 2392511837.0, + "step": 4679 + }, + { + "epoch": 1.2655489453758788, + "grad_norm": 2.71875, + "learning_rate": 0.01775209419104905, + "loss": 3.5454, + "mean_token_accuracy": 0.3743492364883423, + "num_tokens": 2393036009.0, + "step": 4680 + }, + { + "epoch": 1.2658193618171985, + "grad_norm": 2.671875, + "learning_rate": 0.017751051753966036, + "loss": 3.2432, + "mean_token_accuracy": 0.3685746192932129, + "num_tokens": 2393542650.0, + "step": 4681 + }, + { + "epoch": 1.266089778258518, + "grad_norm": 3.40625, + "learning_rate": 0.017750009109740894, + "loss": 3.3823, + "mean_token_accuracy": 0.3786834478378296, + "num_tokens": 2394017425.0, + "step": 4682 + }, + { + "epoch": 1.2663601946998377, + "grad_norm": 2.71875, + "learning_rate": 0.017748966258405628, + "loss": 3.1763, + "mean_token_accuracy": 0.3857734501361847, + "num_tokens": 2394541547.0, + "step": 4683 + }, + { + "epoch": 1.2666306111411574, + "grad_norm": 3.359375, + "learning_rate": 0.017747923199992226, + "loss": 3.2103, + "mean_token_accuracy": 0.40128034353256226, + "num_tokens": 2395065632.0, + "step": 4684 + }, + { + "epoch": 1.266901027582477, + "grad_norm": 4.15625, + "learning_rate": 0.017746879934532694, + "loss": 3.221, + "mean_token_accuracy": 0.35459452867507935, + "num_tokens": 2395530559.0, + "step": 4685 + }, + { + "epoch": 1.2671714440237967, + "grad_norm": 2.3125, + "learning_rate": 0.017745836462059048, + "loss": 3.4657, + "mean_token_accuracy": 0.3679569363594055, + "num_tokens": 2396054799.0, + "step": 4686 + }, + { + "epoch": 1.2674418604651163, + "grad_norm": 3.125, + "learning_rate": 0.017744792782603304, + "loss": 3.3387, + "mean_token_accuracy": 0.38098400831222534, + "num_tokens": 2396578996.0, + "step": 4687 + }, + { + "epoch": 1.267712276906436, + "grad_norm": 2.734375, + "learning_rate": 0.017743748896197474, + "loss": 3.342, + "mean_token_accuracy": 0.3575868010520935, + "num_tokens": 2397103267.0, + "step": 4688 + }, + { + "epoch": 1.2679826933477556, + "grad_norm": 3.25, + "learning_rate": 0.017742704802873602, + "loss": 3.1925, + "mean_token_accuracy": 0.3969029188156128, + "num_tokens": 2397618188.0, + "step": 4689 + }, + { + "epoch": 1.2682531097890752, + "grad_norm": 2.515625, + "learning_rate": 0.017741660502663716, + "loss": 3.1912, + "mean_token_accuracy": 0.38569533824920654, + "num_tokens": 2398142383.0, + "step": 4690 + }, + { + "epoch": 1.2685235262303949, + "grad_norm": 95.5, + "learning_rate": 0.01774061599559986, + "loss": 14.5474, + "mean_token_accuracy": 0.006051556207239628, + "num_tokens": 2398666662.0, + "step": 4691 + }, + { + "epoch": 1.2687939426717145, + "grad_norm": 7.59375, + "learning_rate": 0.017739571281714082, + "loss": 4.0136, + "mean_token_accuracy": 0.31032851338386536, + "num_tokens": 2399190846.0, + "step": 4692 + }, + { + "epoch": 1.2690643591130342, + "grad_norm": 2.078125, + "learning_rate": 0.017738526361038436, + "loss": 3.5145, + "mean_token_accuracy": 0.3543620705604553, + "num_tokens": 2399714938.0, + "step": 4693 + }, + { + "epoch": 1.2693347755543538, + "grad_norm": 2.1875, + "learning_rate": 0.017737481233604985, + "loss": 3.3491, + "mean_token_accuracy": 0.37471699714660645, + "num_tokens": 2400239203.0, + "step": 4694 + }, + { + "epoch": 1.2696051919956735, + "grad_norm": 2.6875, + "learning_rate": 0.0177364358994458, + "loss": 3.394, + "mean_token_accuracy": 0.37889736890792847, + "num_tokens": 2400763434.0, + "step": 4695 + }, + { + "epoch": 1.269875608436993, + "grad_norm": 2.46875, + "learning_rate": 0.017735390358592947, + "loss": 3.3532, + "mean_token_accuracy": 0.36820369958877563, + "num_tokens": 2401287713.0, + "step": 4696 + }, + { + "epoch": 1.2701460248783125, + "grad_norm": 2.609375, + "learning_rate": 0.01773434461107851, + "loss": 3.3884, + "mean_token_accuracy": 0.37254464626312256, + "num_tokens": 2401811934.0, + "step": 4697 + }, + { + "epoch": 1.2704164413196322, + "grad_norm": 3.546875, + "learning_rate": 0.017733298656934578, + "loss": 3.5484, + "mean_token_accuracy": 0.35237643122673035, + "num_tokens": 2402336216.0, + "step": 4698 + }, + { + "epoch": 1.2706868577609518, + "grad_norm": 3.0, + "learning_rate": 0.017732252496193243, + "loss": 3.3646, + "mean_token_accuracy": 0.3800662159919739, + "num_tokens": 2402843279.0, + "step": 4699 + }, + { + "epoch": 1.2709572742022714, + "grad_norm": 2.78125, + "learning_rate": 0.0177312061288866, + "loss": 3.2271, + "mean_token_accuracy": 0.39181727170944214, + "num_tokens": 2403367410.0, + "step": 4700 + }, + { + "epoch": 1.271227690643591, + "grad_norm": 2.796875, + "learning_rate": 0.01773015955504676, + "loss": 3.4634, + "mean_token_accuracy": 0.3721146881580353, + "num_tokens": 2403891682.0, + "step": 4701 + }, + { + "epoch": 1.2714981070849107, + "grad_norm": 2.828125, + "learning_rate": 0.017729112774705832, + "loss": 3.1845, + "mean_token_accuracy": 0.37743079662323, + "num_tokens": 2404415961.0, + "step": 4702 + }, + { + "epoch": 1.2717685235262304, + "grad_norm": 2.78125, + "learning_rate": 0.017728065787895934, + "loss": 3.3983, + "mean_token_accuracy": 0.3695813715457916, + "num_tokens": 2404940227.0, + "step": 4703 + }, + { + "epoch": 1.27203893996755, + "grad_norm": 2.390625, + "learning_rate": 0.017727018594649195, + "loss": 3.216, + "mean_token_accuracy": 0.40690159797668457, + "num_tokens": 2405464353.0, + "step": 4704 + }, + { + "epoch": 1.2723093564088697, + "grad_norm": 2.859375, + "learning_rate": 0.017725971194997737, + "loss": 3.2293, + "mean_token_accuracy": 0.3919450640678406, + "num_tokens": 2405929688.0, + "step": 4705 + }, + { + "epoch": 1.2725797728501893, + "grad_norm": 3.265625, + "learning_rate": 0.017724923588973707, + "loss": 3.3456, + "mean_token_accuracy": 0.3719520568847656, + "num_tokens": 2406453908.0, + "step": 4706 + }, + { + "epoch": 1.272850189291509, + "grad_norm": 3.34375, + "learning_rate": 0.017723875776609246, + "loss": 3.1292, + "mean_token_accuracy": 0.37512242794036865, + "num_tokens": 2406978134.0, + "step": 4707 + }, + { + "epoch": 1.2731206057328286, + "grad_norm": 3.0625, + "learning_rate": 0.0177228277579365, + "loss": 3.4762, + "mean_token_accuracy": 0.361843466758728, + "num_tokens": 2407502407.0, + "step": 4708 + }, + { + "epoch": 1.2733910221741482, + "grad_norm": 3.1875, + "learning_rate": 0.017721779532987626, + "loss": 3.5487, + "mean_token_accuracy": 0.347700297832489, + "num_tokens": 2408026624.0, + "step": 4709 + }, + { + "epoch": 1.2736614386154679, + "grad_norm": 3.203125, + "learning_rate": 0.01772073110179479, + "loss": 3.4452, + "mean_token_accuracy": 0.37242650985717773, + "num_tokens": 2408550874.0, + "step": 4710 + }, + { + "epoch": 1.2739318550567875, + "grad_norm": 164.0, + "learning_rate": 0.017719682464390162, + "loss": 16.459, + "mean_token_accuracy": 5.655968561768532e-05, + "num_tokens": 2409075054.0, + "step": 4711 + }, + { + "epoch": 1.2742022714981072, + "grad_norm": 11.375, + "learning_rate": 0.01771863362080591, + "loss": 3.8145, + "mean_token_accuracy": 0.3669207692146301, + "num_tokens": 2409599322.0, + "step": 4712 + }, + { + "epoch": 1.2744726879394266, + "grad_norm": 4.25, + "learning_rate": 0.017717584571074224, + "loss": 3.4217, + "mean_token_accuracy": 0.3692580461502075, + "num_tokens": 2410123522.0, + "step": 4713 + }, + { + "epoch": 1.2747431043807462, + "grad_norm": 4.625, + "learning_rate": 0.017716535315227287, + "loss": 3.5445, + "mean_token_accuracy": 0.34342408180236816, + "num_tokens": 2410647770.0, + "step": 4714 + }, + { + "epoch": 1.2750135208220659, + "grad_norm": 2.875, + "learning_rate": 0.017715485853297294, + "loss": 3.4519, + "mean_token_accuracy": 0.3742038607597351, + "num_tokens": 2411171934.0, + "step": 4715 + }, + { + "epoch": 1.2752839372633855, + "grad_norm": 3.40625, + "learning_rate": 0.017714436185316446, + "loss": 3.2232, + "mean_token_accuracy": 0.3712535798549652, + "num_tokens": 2411696195.0, + "step": 4716 + }, + { + "epoch": 1.2755543537047052, + "grad_norm": 2.796875, + "learning_rate": 0.017713386311316952, + "loss": 3.2638, + "mean_token_accuracy": 0.3790404498577118, + "num_tokens": 2412220403.0, + "step": 4717 + }, + { + "epoch": 1.2758247701460248, + "grad_norm": 4.625, + "learning_rate": 0.017712336231331022, + "loss": 3.5626, + "mean_token_accuracy": 0.35785728693008423, + "num_tokens": 2412713682.0, + "step": 4718 + }, + { + "epoch": 1.2760951865873444, + "grad_norm": 3.296875, + "learning_rate": 0.017711285945390877, + "loss": 3.4052, + "mean_token_accuracy": 0.3816491365432739, + "num_tokens": 2413237942.0, + "step": 4719 + }, + { + "epoch": 1.276365603028664, + "grad_norm": 2.90625, + "learning_rate": 0.017710235453528744, + "loss": 3.5415, + "mean_token_accuracy": 0.3401382267475128, + "num_tokens": 2413762195.0, + "step": 4720 + }, + { + "epoch": 1.2766360194699837, + "grad_norm": 2.9375, + "learning_rate": 0.017709184755776847, + "loss": 3.3764, + "mean_token_accuracy": 0.40215936303138733, + "num_tokens": 2414248833.0, + "step": 4721 + }, + { + "epoch": 1.2769064359113034, + "grad_norm": 3.078125, + "learning_rate": 0.017708133852167438, + "loss": 3.4413, + "mean_token_accuracy": 0.38844841718673706, + "num_tokens": 2414708185.0, + "step": 4722 + }, + { + "epoch": 1.277176852352623, + "grad_norm": 2.78125, + "learning_rate": 0.017707082742732756, + "loss": 3.186, + "mean_token_accuracy": 0.41755926609039307, + "num_tokens": 2415205244.0, + "step": 4723 + }, + { + "epoch": 1.2774472687939427, + "grad_norm": 2.671875, + "learning_rate": 0.017706031427505046, + "loss": 3.2423, + "mean_token_accuracy": 0.3905887007713318, + "num_tokens": 2415721647.0, + "step": 4724 + }, + { + "epoch": 1.2777176852352623, + "grad_norm": 3.890625, + "learning_rate": 0.017704979906516576, + "loss": 3.2793, + "mean_token_accuracy": 0.389255166053772, + "num_tokens": 2416245810.0, + "step": 4725 + }, + { + "epoch": 1.277988101676582, + "grad_norm": 3.140625, + "learning_rate": 0.0177039281797996, + "loss": 3.3355, + "mean_token_accuracy": 0.3859241008758545, + "num_tokens": 2416750394.0, + "step": 4726 + }, + { + "epoch": 1.2782585181179016, + "grad_norm": 2.859375, + "learning_rate": 0.017702876247386395, + "loss": 3.2399, + "mean_token_accuracy": 0.36509624123573303, + "num_tokens": 2417274498.0, + "step": 4727 + }, + { + "epoch": 1.2785289345592212, + "grad_norm": 2.375, + "learning_rate": 0.017701824109309233, + "loss": 3.1222, + "mean_token_accuracy": 0.41509753465652466, + "num_tokens": 2417798641.0, + "step": 4728 + }, + { + "epoch": 1.2787993510005409, + "grad_norm": 2.21875, + "learning_rate": 0.0177007717656004, + "loss": 3.4137, + "mean_token_accuracy": 0.3618502914905548, + "num_tokens": 2418322908.0, + "step": 4729 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 2.875, + "learning_rate": 0.017699719216292185, + "loss": 3.3005, + "mean_token_accuracy": 0.3795287311077118, + "num_tokens": 2418815983.0, + "step": 4730 + }, + { + "epoch": 1.2793401838831802, + "grad_norm": 8.1875, + "learning_rate": 0.017698666461416875, + "loss": 9.9485, + "mean_token_accuracy": 0.0007994850166141987, + "num_tokens": 2419340126.0, + "step": 4731 + }, + { + "epoch": 1.2796106003244998, + "grad_norm": 7.0, + "learning_rate": 0.017697613501006786, + "loss": 3.9856, + "mean_token_accuracy": 0.3008764386177063, + "num_tokens": 2419864327.0, + "step": 4732 + }, + { + "epoch": 1.2798810167658194, + "grad_norm": 2.359375, + "learning_rate": 0.017696560335094213, + "loss": 3.4407, + "mean_token_accuracy": 0.35335955023765564, + "num_tokens": 2420388575.0, + "step": 4733 + }, + { + "epoch": 1.280151433207139, + "grad_norm": 2.5, + "learning_rate": 0.017695506963711482, + "loss": 3.187, + "mean_token_accuracy": 0.38005149364471436, + "num_tokens": 2420912736.0, + "step": 4734 + }, + { + "epoch": 1.2804218496484587, + "grad_norm": 2.1875, + "learning_rate": 0.017694453386890904, + "loss": 3.3954, + "mean_token_accuracy": 0.36598628759384155, + "num_tokens": 2421437007.0, + "step": 4735 + }, + { + "epoch": 1.2806922660897784, + "grad_norm": 2.578125, + "learning_rate": 0.01769339960466481, + "loss": 3.1883, + "mean_token_accuracy": 0.38227665424346924, + "num_tokens": 2421961003.0, + "step": 4736 + }, + { + "epoch": 1.280962682531098, + "grad_norm": 3.703125, + "learning_rate": 0.017692345617065525, + "loss": 3.2887, + "mean_token_accuracy": 0.3923302888870239, + "num_tokens": 2422471872.0, + "step": 4737 + }, + { + "epoch": 1.2812330989724177, + "grad_norm": 3.15625, + "learning_rate": 0.017691291424125403, + "loss": 3.3359, + "mean_token_accuracy": 0.3902493715286255, + "num_tokens": 2422935404.0, + "step": 4738 + }, + { + "epoch": 1.281503515413737, + "grad_norm": 3.640625, + "learning_rate": 0.01769023702587678, + "loss": 3.4872, + "mean_token_accuracy": 0.37138891220092773, + "num_tokens": 2423459672.0, + "step": 4739 + }, + { + "epoch": 1.2817739318550567, + "grad_norm": 3.5625, + "learning_rate": 0.017689182422352012, + "loss": 3.456, + "mean_token_accuracy": 0.3552725911140442, + "num_tokens": 2423983807.0, + "step": 4740 + }, + { + "epoch": 1.2820443482963764, + "grad_norm": 2.71875, + "learning_rate": 0.017688127613583453, + "loss": 3.282, + "mean_token_accuracy": 0.3775820732116699, + "num_tokens": 2424468508.0, + "step": 4741 + }, + { + "epoch": 1.282314764737696, + "grad_norm": 2.78125, + "learning_rate": 0.017687072599603473, + "loss": 3.3402, + "mean_token_accuracy": 0.37084200978279114, + "num_tokens": 2424992781.0, + "step": 4742 + }, + { + "epoch": 1.2825851811790157, + "grad_norm": 3.265625, + "learning_rate": 0.01768601738044444, + "loss": 3.3313, + "mean_token_accuracy": 0.37159067392349243, + "num_tokens": 2425512460.0, + "step": 4743 + }, + { + "epoch": 1.2828555976203353, + "grad_norm": 3.09375, + "learning_rate": 0.01768496195613873, + "loss": 3.423, + "mean_token_accuracy": 0.3757029175758362, + "num_tokens": 2426036703.0, + "step": 4744 + }, + { + "epoch": 1.283126014061655, + "grad_norm": 3.375, + "learning_rate": 0.017683906326718728, + "loss": 3.4934, + "mean_token_accuracy": 0.3641560971736908, + "num_tokens": 2426560981.0, + "step": 4745 + }, + { + "epoch": 1.2833964305029746, + "grad_norm": 3.015625, + "learning_rate": 0.017682850492216823, + "loss": 3.0604, + "mean_token_accuracy": 0.40136536955833435, + "num_tokens": 2427076263.0, + "step": 4746 + }, + { + "epoch": 1.2836668469442942, + "grad_norm": 3.140625, + "learning_rate": 0.017681794452665416, + "loss": 3.5338, + "mean_token_accuracy": 0.351033478975296, + "num_tokens": 2427600467.0, + "step": 4747 + }, + { + "epoch": 1.2839372633856139, + "grad_norm": 3.28125, + "learning_rate": 0.017680738208096902, + "loss": 3.4691, + "mean_token_accuracy": 0.3689361810684204, + "num_tokens": 2428124668.0, + "step": 4748 + }, + { + "epoch": 1.2842076798269335, + "grad_norm": 3.359375, + "learning_rate": 0.017679681758543692, + "loss": 3.2362, + "mean_token_accuracy": 0.3758728802204132, + "num_tokens": 2428648888.0, + "step": 4749 + }, + { + "epoch": 1.2844780962682532, + "grad_norm": 2.984375, + "learning_rate": 0.017678625104038205, + "loss": 3.4545, + "mean_token_accuracy": 0.3868294358253479, + "num_tokens": 2429152506.0, + "step": 4750 + }, + { + "epoch": 1.2847485127095728, + "grad_norm": 118.0, + "learning_rate": 0.01767756824461286, + "loss": 13.7927, + "mean_token_accuracy": 0.0011162121081724763, + "num_tokens": 2429623583.0, + "step": 4751 + }, + { + "epoch": 1.2850189291508924, + "grad_norm": 6.375, + "learning_rate": 0.017676511180300083, + "loss": 3.797, + "mean_token_accuracy": 0.2915298342704773, + "num_tokens": 2430147811.0, + "step": 4752 + }, + { + "epoch": 1.285289345592212, + "grad_norm": 2.78125, + "learning_rate": 0.017675453911132306, + "loss": 3.6605, + "mean_token_accuracy": 0.34085899591445923, + "num_tokens": 2430632960.0, + "step": 4753 + }, + { + "epoch": 1.2855597620335315, + "grad_norm": 2.546875, + "learning_rate": 0.017674396437141974, + "loss": 3.3937, + "mean_token_accuracy": 0.3788352608680725, + "num_tokens": 2431157139.0, + "step": 4754 + }, + { + "epoch": 1.2858301784748511, + "grad_norm": 2.703125, + "learning_rate": 0.017673338758361527, + "loss": 3.186, + "mean_token_accuracy": 0.3796616792678833, + "num_tokens": 2431681182.0, + "step": 4755 + }, + { + "epoch": 1.2861005949161708, + "grad_norm": 4.125, + "learning_rate": 0.017672280874823424, + "loss": 3.3539, + "mean_token_accuracy": 0.368461549282074, + "num_tokens": 2432205406.0, + "step": 4756 + }, + { + "epoch": 1.2863710113574904, + "grad_norm": 2.515625, + "learning_rate": 0.017671222786560123, + "loss": 3.3611, + "mean_token_accuracy": 0.3820977509021759, + "num_tokens": 2432729679.0, + "step": 4757 + }, + { + "epoch": 1.28664142779881, + "grad_norm": 3.640625, + "learning_rate": 0.01767016449360409, + "loss": 3.3761, + "mean_token_accuracy": 0.37941789627075195, + "num_tokens": 2433210304.0, + "step": 4758 + }, + { + "epoch": 1.2869118442401297, + "grad_norm": 2.703125, + "learning_rate": 0.017669105995987793, + "loss": 3.2816, + "mean_token_accuracy": 0.3810175359249115, + "num_tokens": 2433734553.0, + "step": 4759 + }, + { + "epoch": 1.2871822606814494, + "grad_norm": 3.0625, + "learning_rate": 0.01766804729374371, + "loss": 3.4204, + "mean_token_accuracy": 0.3566412329673767, + "num_tokens": 2434258808.0, + "step": 4760 + }, + { + "epoch": 1.287452677122769, + "grad_norm": 2.234375, + "learning_rate": 0.01766698838690433, + "loss": 3.338, + "mean_token_accuracy": 0.3692152202129364, + "num_tokens": 2434783072.0, + "step": 4761 + }, + { + "epoch": 1.2877230935640886, + "grad_norm": 3.4375, + "learning_rate": 0.017665929275502135, + "loss": 3.409, + "mean_token_accuracy": 0.36827194690704346, + "num_tokens": 2435284353.0, + "step": 4762 + }, + { + "epoch": 1.2879935100054083, + "grad_norm": 2.5, + "learning_rate": 0.017664869959569632, + "loss": 3.2783, + "mean_token_accuracy": 0.3773239850997925, + "num_tokens": 2435808598.0, + "step": 4763 + }, + { + "epoch": 1.288263926446728, + "grad_norm": 3.359375, + "learning_rate": 0.017663810439139317, + "loss": 3.3167, + "mean_token_accuracy": 0.3709487318992615, + "num_tokens": 2436332812.0, + "step": 4764 + }, + { + "epoch": 1.2885343428880476, + "grad_norm": 2.234375, + "learning_rate": 0.017662750714243697, + "loss": 3.0527, + "mean_token_accuracy": 0.398845911026001, + "num_tokens": 2436857021.0, + "step": 4765 + }, + { + "epoch": 1.2888047593293672, + "grad_norm": 2.640625, + "learning_rate": 0.017661690784915293, + "loss": 3.3039, + "mean_token_accuracy": 0.3787122368812561, + "num_tokens": 2437381308.0, + "step": 4766 + }, + { + "epoch": 1.2890751757706869, + "grad_norm": 2.734375, + "learning_rate": 0.017660630651186626, + "loss": 3.3417, + "mean_token_accuracy": 0.39544677734375, + "num_tokens": 2437842279.0, + "step": 4767 + }, + { + "epoch": 1.2893455922120065, + "grad_norm": 2.375, + "learning_rate": 0.017659570313090227, + "loss": 3.1989, + "mean_token_accuracy": 0.3906788229942322, + "num_tokens": 2438366375.0, + "step": 4768 + }, + { + "epoch": 1.2896160086533262, + "grad_norm": 2.5625, + "learning_rate": 0.01765850977065862, + "loss": 3.4095, + "mean_token_accuracy": 0.3744187355041504, + "num_tokens": 2438890643.0, + "step": 4769 + }, + { + "epoch": 1.2898864250946458, + "grad_norm": 2.4375, + "learning_rate": 0.01765744902392435, + "loss": 3.1876, + "mean_token_accuracy": 0.3889918923377991, + "num_tokens": 2439371687.0, + "step": 4770 + }, + { + "epoch": 1.2901568415359654, + "grad_norm": 21.5, + "learning_rate": 0.01765638807291997, + "loss": 10.8192, + "mean_token_accuracy": 9.596292511560023e-05, + "num_tokens": 2439895895.0, + "step": 4771 + }, + { + "epoch": 1.290427257977285, + "grad_norm": 7.5625, + "learning_rate": 0.01765532691767803, + "loss": 4.0468, + "mean_token_accuracy": 0.2929217517375946, + "num_tokens": 2440420100.0, + "step": 4772 + }, + { + "epoch": 1.2906976744186047, + "grad_norm": 2.34375, + "learning_rate": 0.01765426555823109, + "loss": 3.4542, + "mean_token_accuracy": 0.3619033098220825, + "num_tokens": 2440944208.0, + "step": 4773 + }, + { + "epoch": 1.2909680908599244, + "grad_norm": 2.484375, + "learning_rate": 0.017653203994611705, + "loss": 3.4104, + "mean_token_accuracy": 0.40223371982574463, + "num_tokens": 2441404880.0, + "step": 4774 + }, + { + "epoch": 1.291238507301244, + "grad_norm": 3.0625, + "learning_rate": 0.017652142226852463, + "loss": 3.4672, + "mean_token_accuracy": 0.3886803388595581, + "num_tokens": 2441901541.0, + "step": 4775 + }, + { + "epoch": 1.2915089237425637, + "grad_norm": 2.296875, + "learning_rate": 0.01765108025498593, + "loss": 3.1431, + "mean_token_accuracy": 0.39457473158836365, + "num_tokens": 2442406665.0, + "step": 4776 + }, + { + "epoch": 1.2917793401838833, + "grad_norm": 2.890625, + "learning_rate": 0.0176500180790447, + "loss": 3.1627, + "mean_token_accuracy": 0.4002954065799713, + "num_tokens": 2442930881.0, + "step": 4777 + }, + { + "epoch": 1.292049756625203, + "grad_norm": 2.921875, + "learning_rate": 0.01764895569906136, + "loss": 3.5799, + "mean_token_accuracy": 0.37838214635849, + "num_tokens": 2443396935.0, + "step": 4778 + }, + { + "epoch": 1.2923201730665226, + "grad_norm": 2.921875, + "learning_rate": 0.0176478931150685, + "loss": 3.4531, + "mean_token_accuracy": 0.3762020468711853, + "num_tokens": 2443921134.0, + "step": 4779 + }, + { + "epoch": 1.292590589507842, + "grad_norm": 3.71875, + "learning_rate": 0.017646830327098732, + "loss": 3.1088, + "mean_token_accuracy": 0.3704168200492859, + "num_tokens": 2444445280.0, + "step": 4780 + }, + { + "epoch": 1.2928610059491616, + "grad_norm": 2.703125, + "learning_rate": 0.01764576733518466, + "loss": 3.2758, + "mean_token_accuracy": 0.3534415364265442, + "num_tokens": 2444929805.0, + "step": 4781 + }, + { + "epoch": 1.2931314223904813, + "grad_norm": 2.828125, + "learning_rate": 0.017644704139358904, + "loss": 3.0506, + "mean_token_accuracy": 0.38274696469306946, + "num_tokens": 2445453857.0, + "step": 4782 + }, + { + "epoch": 1.293401838831801, + "grad_norm": 3.03125, + "learning_rate": 0.017643640739654087, + "loss": 3.335, + "mean_token_accuracy": 0.4193522334098816, + "num_tokens": 2445913837.0, + "step": 4783 + }, + { + "epoch": 1.2936722552731206, + "grad_norm": 3.53125, + "learning_rate": 0.017642577136102833, + "loss": 3.4312, + "mean_token_accuracy": 0.36886391043663025, + "num_tokens": 2446438021.0, + "step": 4784 + }, + { + "epoch": 1.2939426717144402, + "grad_norm": 2.90625, + "learning_rate": 0.017641513328737774, + "loss": 3.3616, + "mean_token_accuracy": 0.3883898854255676, + "num_tokens": 2446928906.0, + "step": 4785 + }, + { + "epoch": 1.2942130881557599, + "grad_norm": 3.59375, + "learning_rate": 0.01764044931759156, + "loss": 3.4672, + "mean_token_accuracy": 0.3601076900959015, + "num_tokens": 2447452885.0, + "step": 4786 + }, + { + "epoch": 1.2944835045970795, + "grad_norm": 3.6875, + "learning_rate": 0.01763938510269683, + "loss": 3.4663, + "mean_token_accuracy": 0.37056681513786316, + "num_tokens": 2447977084.0, + "step": 4787 + }, + { + "epoch": 1.2947539210383991, + "grad_norm": 3.5, + "learning_rate": 0.01763832068408624, + "loss": 3.3137, + "mean_token_accuracy": 0.35445544123649597, + "num_tokens": 2448501236.0, + "step": 4788 + }, + { + "epoch": 1.2950243374797188, + "grad_norm": 2.515625, + "learning_rate": 0.017637256061792447, + "loss": 3.3587, + "mean_token_accuracy": 0.36156177520751953, + "num_tokens": 2449025398.0, + "step": 4789 + }, + { + "epoch": 1.2952947539210384, + "grad_norm": 4.0, + "learning_rate": 0.017636191235848125, + "loss": 3.3329, + "mean_token_accuracy": 0.3881979286670685, + "num_tokens": 2449549659.0, + "step": 4790 + }, + { + "epoch": 1.295565170362358, + "grad_norm": 13.625, + "learning_rate": 0.017635126206285937, + "loss": 12.4158, + "mean_token_accuracy": 0.009875187650322914, + "num_tokens": 2450073760.0, + "step": 4791 + }, + { + "epoch": 1.2958355868036777, + "grad_norm": 9.375, + "learning_rate": 0.017634060973138563, + "loss": 3.7737, + "mean_token_accuracy": 0.35308802127838135, + "num_tokens": 2450535155.0, + "step": 4792 + }, + { + "epoch": 1.2961060032449974, + "grad_norm": 3.65625, + "learning_rate": 0.01763299553643869, + "loss": 3.5665, + "mean_token_accuracy": 0.377924382686615, + "num_tokens": 2451004330.0, + "step": 4793 + }, + { + "epoch": 1.296376419686317, + "grad_norm": 3.078125, + "learning_rate": 0.017631929896219006, + "loss": 3.5352, + "mean_token_accuracy": 0.3721469044685364, + "num_tokens": 2451528489.0, + "step": 4794 + }, + { + "epoch": 1.2966468361276364, + "grad_norm": 3.734375, + "learning_rate": 0.017630864052512213, + "loss": 3.6032, + "mean_token_accuracy": 0.3427492380142212, + "num_tokens": 2452052657.0, + "step": 4795 + }, + { + "epoch": 1.296917252568956, + "grad_norm": 2.34375, + "learning_rate": 0.01762979800535101, + "loss": 3.2523, + "mean_token_accuracy": 0.37135371565818787, + "num_tokens": 2452576934.0, + "step": 4796 + }, + { + "epoch": 1.2971876690102757, + "grad_norm": 2.234375, + "learning_rate": 0.017628731754768104, + "loss": 3.3621, + "mean_token_accuracy": 0.37095171213150024, + "num_tokens": 2453101113.0, + "step": 4797 + }, + { + "epoch": 1.2974580854515954, + "grad_norm": 2.734375, + "learning_rate": 0.017627665300796215, + "loss": 3.1347, + "mean_token_accuracy": 0.3958855867385864, + "num_tokens": 2453615753.0, + "step": 4798 + }, + { + "epoch": 1.297728501892915, + "grad_norm": 3.265625, + "learning_rate": 0.017626598643468067, + "loss": 3.129, + "mean_token_accuracy": 0.3920806646347046, + "num_tokens": 2454139989.0, + "step": 4799 + }, + { + "epoch": 1.2979989183342346, + "grad_norm": 3.953125, + "learning_rate": 0.01762553178281638, + "loss": 3.4023, + "mean_token_accuracy": 0.3661075830459595, + "num_tokens": 2454664242.0, + "step": 4800 + }, + { + "epoch": 1.2982693347755543, + "grad_norm": 3.53125, + "learning_rate": 0.017624464718873897, + "loss": 3.3697, + "mean_token_accuracy": 0.3770357668399811, + "num_tokens": 2455188426.0, + "step": 4801 + }, + { + "epoch": 1.298539751216874, + "grad_norm": 3.53125, + "learning_rate": 0.017623397451673357, + "loss": 3.2122, + "mean_token_accuracy": 0.4177309274673462, + "num_tokens": 2455712645.0, + "step": 4802 + }, + { + "epoch": 1.2988101676581936, + "grad_norm": 2.0, + "learning_rate": 0.017622329981247506, + "loss": 3.2694, + "mean_token_accuracy": 0.3786895275115967, + "num_tokens": 2456236922.0, + "step": 4803 + }, + { + "epoch": 1.2990805840995132, + "grad_norm": 2.703125, + "learning_rate": 0.017621262307629088, + "loss": 3.3216, + "mean_token_accuracy": 0.3952764868736267, + "num_tokens": 2456742389.0, + "step": 4804 + }, + { + "epoch": 1.2993510005408329, + "grad_norm": 2.984375, + "learning_rate": 0.017620194430850878, + "loss": 3.4382, + "mean_token_accuracy": 0.380868136882782, + "num_tokens": 2457266449.0, + "step": 4805 + }, + { + "epoch": 1.2996214169821525, + "grad_norm": 4.0, + "learning_rate": 0.017619126350945633, + "loss": 3.4174, + "mean_token_accuracy": 0.3732258081436157, + "num_tokens": 2457790713.0, + "step": 4806 + }, + { + "epoch": 1.2998918334234721, + "grad_norm": 3.21875, + "learning_rate": 0.017618058067946123, + "loss": 3.4175, + "mean_token_accuracy": 0.37256839871406555, + "num_tokens": 2458314840.0, + "step": 4807 + }, + { + "epoch": 1.3001622498647918, + "grad_norm": 3.609375, + "learning_rate": 0.01761698958188513, + "loss": 3.3994, + "mean_token_accuracy": 0.3743944764137268, + "num_tokens": 2458783096.0, + "step": 4808 + }, + { + "epoch": 1.3004326663061114, + "grad_norm": 2.859375, + "learning_rate": 0.01761592089279544, + "loss": 3.2953, + "mean_token_accuracy": 0.38741934299468994, + "num_tokens": 2459307164.0, + "step": 4809 + }, + { + "epoch": 1.300703082747431, + "grad_norm": 3.53125, + "learning_rate": 0.017614852000709835, + "loss": 3.4353, + "mean_token_accuracy": 0.36187803745269775, + "num_tokens": 2459831308.0, + "step": 4810 + }, + { + "epoch": 1.3009734991887507, + "grad_norm": 3.90625, + "learning_rate": 0.01761378290566112, + "loss": 11.8283, + "mean_token_accuracy": 0.00212042685598135, + "num_tokens": 2460355480.0, + "step": 4811 + }, + { + "epoch": 1.3012439156300704, + "grad_norm": 6.625, + "learning_rate": 0.017612713607682098, + "loss": 3.6528, + "mean_token_accuracy": 0.3550785779953003, + "num_tokens": 2460879722.0, + "step": 4812 + }, + { + "epoch": 1.30151433207139, + "grad_norm": 2.6875, + "learning_rate": 0.017611644106805577, + "loss": 3.4347, + "mean_token_accuracy": 0.3750458359718323, + "num_tokens": 2461392742.0, + "step": 4813 + }, + { + "epoch": 1.3017847485127096, + "grad_norm": 3.21875, + "learning_rate": 0.017610574403064367, + "loss": 3.2227, + "mean_token_accuracy": 0.3633389472961426, + "num_tokens": 2461916966.0, + "step": 4814 + }, + { + "epoch": 1.3020551649540293, + "grad_norm": 2.46875, + "learning_rate": 0.017609504496491294, + "loss": 3.3323, + "mean_token_accuracy": 0.3920022249221802, + "num_tokens": 2462398188.0, + "step": 4815 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 2.78125, + "learning_rate": 0.01760843438711919, + "loss": 3.3528, + "mean_token_accuracy": 0.3912423253059387, + "num_tokens": 2462911253.0, + "step": 4816 + }, + { + "epoch": 1.3025959978366686, + "grad_norm": 3.265625, + "learning_rate": 0.017607364074980883, + "loss": 3.5105, + "mean_token_accuracy": 0.37471073865890503, + "num_tokens": 2463435462.0, + "step": 4817 + }, + { + "epoch": 1.3028664142779882, + "grad_norm": 3.0625, + "learning_rate": 0.017606293560109215, + "loss": 3.2918, + "mean_token_accuracy": 0.37415051460266113, + "num_tokens": 2463934710.0, + "step": 4818 + }, + { + "epoch": 1.3031368307193079, + "grad_norm": 3.0, + "learning_rate": 0.017605222842537033, + "loss": 3.4082, + "mean_token_accuracy": 0.37000852823257446, + "num_tokens": 2464448368.0, + "step": 4819 + }, + { + "epoch": 1.3034072471606275, + "grad_norm": 3.390625, + "learning_rate": 0.01760415192229719, + "loss": 3.4151, + "mean_token_accuracy": 0.3916895389556885, + "num_tokens": 2464972534.0, + "step": 4820 + }, + { + "epoch": 1.303677663601947, + "grad_norm": 4.53125, + "learning_rate": 0.017603080799422542, + "loss": 3.5428, + "mean_token_accuracy": 0.3512817919254303, + "num_tokens": 2465496725.0, + "step": 4821 + }, + { + "epoch": 1.3039480800432666, + "grad_norm": 3.515625, + "learning_rate": 0.017602009473945957, + "loss": 3.5169, + "mean_token_accuracy": 0.38303059339523315, + "num_tokens": 2465982770.0, + "step": 4822 + }, + { + "epoch": 1.3042184964845862, + "grad_norm": 3.375, + "learning_rate": 0.01760093794590031, + "loss": 3.3167, + "mean_token_accuracy": 0.3702869415283203, + "num_tokens": 2466490781.0, + "step": 4823 + }, + { + "epoch": 1.3044889129259059, + "grad_norm": 2.484375, + "learning_rate": 0.017599866215318467, + "loss": 3.3242, + "mean_token_accuracy": 0.38158857822418213, + "num_tokens": 2467000286.0, + "step": 4824 + }, + { + "epoch": 1.3047593293672255, + "grad_norm": 2.765625, + "learning_rate": 0.017598794282233324, + "loss": 3.4139, + "mean_token_accuracy": 0.37868741154670715, + "num_tokens": 2467524437.0, + "step": 4825 + }, + { + "epoch": 1.3050297458085451, + "grad_norm": 2.734375, + "learning_rate": 0.01759772214667777, + "loss": 3.243, + "mean_token_accuracy": 0.37674903869628906, + "num_tokens": 2468048629.0, + "step": 4826 + }, + { + "epoch": 1.3053001622498648, + "grad_norm": 2.984375, + "learning_rate": 0.017596649808684695, + "loss": 3.2755, + "mean_token_accuracy": 0.3893921375274658, + "num_tokens": 2468568581.0, + "step": 4827 + }, + { + "epoch": 1.3055705786911844, + "grad_norm": 2.828125, + "learning_rate": 0.017595577268287005, + "loss": 3.2824, + "mean_token_accuracy": 0.37902939319610596, + "num_tokens": 2469092839.0, + "step": 4828 + }, + { + "epoch": 1.305840995132504, + "grad_norm": 2.703125, + "learning_rate": 0.017594504525517606, + "loss": 3.4362, + "mean_token_accuracy": 0.3633686900138855, + "num_tokens": 2469611692.0, + "step": 4829 + }, + { + "epoch": 1.3061114115738237, + "grad_norm": 3.71875, + "learning_rate": 0.017593431580409418, + "loss": 3.0984, + "mean_token_accuracy": 0.372963547706604, + "num_tokens": 2470135826.0, + "step": 4830 + }, + { + "epoch": 1.3063818280151434, + "grad_norm": 62.0, + "learning_rate": 0.017592358432995357, + "loss": 11.5232, + "mean_token_accuracy": 0.010715823620557785, + "num_tokens": 2470659885.0, + "step": 4831 + }, + { + "epoch": 1.306652244456463, + "grad_norm": 5.9375, + "learning_rate": 0.01759128508330835, + "loss": 3.7937, + "mean_token_accuracy": 0.3421509265899658, + "num_tokens": 2471184117.0, + "step": 4832 + }, + { + "epoch": 1.3069226608977826, + "grad_norm": 2.34375, + "learning_rate": 0.01759021153138134, + "loss": 3.5463, + "mean_token_accuracy": 0.35962149500846863, + "num_tokens": 2471690828.0, + "step": 4833 + }, + { + "epoch": 1.3071930773391023, + "grad_norm": 3.25, + "learning_rate": 0.017589137777247254, + "loss": 3.421, + "mean_token_accuracy": 0.36504054069519043, + "num_tokens": 2472215104.0, + "step": 4834 + }, + { + "epoch": 1.307463493780422, + "grad_norm": 2.828125, + "learning_rate": 0.017588063820939043, + "loss": 3.5579, + "mean_token_accuracy": 0.35031214356422424, + "num_tokens": 2472674962.0, + "step": 4835 + }, + { + "epoch": 1.3077339102217413, + "grad_norm": 2.890625, + "learning_rate": 0.01758698966248966, + "loss": 3.0457, + "mean_token_accuracy": 0.3938603401184082, + "num_tokens": 2473198247.0, + "step": 4836 + }, + { + "epoch": 1.308004326663061, + "grad_norm": 2.6875, + "learning_rate": 0.017585915301932067, + "loss": 3.3027, + "mean_token_accuracy": 0.36056703329086304, + "num_tokens": 2473698712.0, + "step": 4837 + }, + { + "epoch": 1.3082747431043806, + "grad_norm": 2.765625, + "learning_rate": 0.017584840739299223, + "loss": 3.0109, + "mean_token_accuracy": 0.38589245080947876, + "num_tokens": 2474222859.0, + "step": 4838 + }, + { + "epoch": 1.3085451595457003, + "grad_norm": 3.09375, + "learning_rate": 0.0175837659746241, + "loss": 3.2944, + "mean_token_accuracy": 0.3768516182899475, + "num_tokens": 2474711222.0, + "step": 4839 + }, + { + "epoch": 1.30881557598702, + "grad_norm": 3.375, + "learning_rate": 0.017582691007939674, + "loss": 3.5992, + "mean_token_accuracy": 0.3328414559364319, + "num_tokens": 2475235401.0, + "step": 4840 + }, + { + "epoch": 1.3090859924283396, + "grad_norm": 2.4375, + "learning_rate": 0.017581615839278933, + "loss": 3.203, + "mean_token_accuracy": 0.38730549812316895, + "num_tokens": 2475759679.0, + "step": 4841 + }, + { + "epoch": 1.3093564088696592, + "grad_norm": 3.140625, + "learning_rate": 0.017580540468674858, + "loss": 3.498, + "mean_token_accuracy": 0.35059481859207153, + "num_tokens": 2476283950.0, + "step": 4842 + }, + { + "epoch": 1.3096268253109788, + "grad_norm": 2.890625, + "learning_rate": 0.017579464896160454, + "loss": 3.2637, + "mean_token_accuracy": 0.3939284682273865, + "num_tokens": 2476804593.0, + "step": 4843 + }, + { + "epoch": 1.3098972417522985, + "grad_norm": 3.015625, + "learning_rate": 0.017578389121768716, + "loss": 3.4738, + "mean_token_accuracy": 0.37619566917419434, + "num_tokens": 2477285094.0, + "step": 4844 + }, + { + "epoch": 1.3101676581936181, + "grad_norm": 3.515625, + "learning_rate": 0.017577313145532653, + "loss": 3.3015, + "mean_token_accuracy": 0.3704300820827484, + "num_tokens": 2477809300.0, + "step": 4845 + }, + { + "epoch": 1.3104380746349378, + "grad_norm": 2.765625, + "learning_rate": 0.017576236967485278, + "loss": 3.3481, + "mean_token_accuracy": 0.34614628553390503, + "num_tokens": 2478333469.0, + "step": 4846 + }, + { + "epoch": 1.3107084910762574, + "grad_norm": 3.6875, + "learning_rate": 0.01757516058765962, + "loss": 3.4081, + "mean_token_accuracy": 0.3857504427433014, + "num_tokens": 2478857643.0, + "step": 4847 + }, + { + "epoch": 1.310978907517577, + "grad_norm": 5.84375, + "learning_rate": 0.017574084006088692, + "loss": 3.7469, + "mean_token_accuracy": 0.33580726385116577, + "num_tokens": 2479381823.0, + "step": 4848 + }, + { + "epoch": 1.3112493239588967, + "grad_norm": 2.828125, + "learning_rate": 0.017573007222805536, + "loss": 3.2446, + "mean_token_accuracy": 0.37301337718963623, + "num_tokens": 2479906107.0, + "step": 4849 + }, + { + "epoch": 1.3115197404002163, + "grad_norm": 3.65625, + "learning_rate": 0.01757193023784319, + "loss": 3.4139, + "mean_token_accuracy": 0.3804069757461548, + "num_tokens": 2480430282.0, + "step": 4850 + }, + { + "epoch": 1.311790156841536, + "grad_norm": 19.375, + "learning_rate": 0.017570853051234698, + "loss": 10.3702, + "mean_token_accuracy": 0.005255518481135368, + "num_tokens": 2480954506.0, + "step": 4851 + }, + { + "epoch": 1.3120605732828556, + "grad_norm": 7.96875, + "learning_rate": 0.01756977566301311, + "loss": 4.2182, + "mean_token_accuracy": 0.2707628011703491, + "num_tokens": 2481426398.0, + "step": 4852 + }, + { + "epoch": 1.3123309897241753, + "grad_norm": 2.78125, + "learning_rate": 0.017568698073211483, + "loss": 3.6956, + "mean_token_accuracy": 0.33197468519210815, + "num_tokens": 2481950677.0, + "step": 4853 + }, + { + "epoch": 1.312601406165495, + "grad_norm": 2.78125, + "learning_rate": 0.017567620281862883, + "loss": 3.2732, + "mean_token_accuracy": 0.3829346299171448, + "num_tokens": 2482474789.0, + "step": 4854 + }, + { + "epoch": 1.3128718226068146, + "grad_norm": 3.203125, + "learning_rate": 0.017566542289000377, + "loss": 3.5708, + "mean_token_accuracy": 0.359002947807312, + "num_tokens": 2482998969.0, + "step": 4855 + }, + { + "epoch": 1.3131422390481342, + "grad_norm": 3.78125, + "learning_rate": 0.017565464094657045, + "loss": 3.3963, + "mean_token_accuracy": 0.3557472229003906, + "num_tokens": 2483523123.0, + "step": 4856 + }, + { + "epoch": 1.3134126554894539, + "grad_norm": 3.546875, + "learning_rate": 0.017564385698865963, + "loss": 3.2956, + "mean_token_accuracy": 0.4224591851234436, + "num_tokens": 2484047344.0, + "step": 4857 + }, + { + "epoch": 1.3136830719307735, + "grad_norm": 2.703125, + "learning_rate": 0.01756330710166023, + "loss": 3.2635, + "mean_token_accuracy": 0.3754483461380005, + "num_tokens": 2484571505.0, + "step": 4858 + }, + { + "epoch": 1.3139534883720931, + "grad_norm": 2.703125, + "learning_rate": 0.017562228303072927, + "loss": 3.316, + "mean_token_accuracy": 0.3785805106163025, + "num_tokens": 2485095644.0, + "step": 4859 + }, + { + "epoch": 1.3142239048134128, + "grad_norm": 2.4375, + "learning_rate": 0.017561149303137163, + "loss": 3.1228, + "mean_token_accuracy": 0.3932493329048157, + "num_tokens": 2485555029.0, + "step": 4860 + }, + { + "epoch": 1.3144943212547324, + "grad_norm": 2.8125, + "learning_rate": 0.017560070101886045, + "loss": 3.2176, + "mean_token_accuracy": 0.4083542823791504, + "num_tokens": 2486079097.0, + "step": 4861 + }, + { + "epoch": 1.3147647376960518, + "grad_norm": 3.328125, + "learning_rate": 0.017558990699352683, + "loss": 3.4311, + "mean_token_accuracy": 0.3718113899230957, + "num_tokens": 2486603289.0, + "step": 4862 + }, + { + "epoch": 1.3150351541373715, + "grad_norm": 3.828125, + "learning_rate": 0.0175579110955702, + "loss": 3.5883, + "mean_token_accuracy": 0.33784744143486023, + "num_tokens": 2487062871.0, + "step": 4863 + }, + { + "epoch": 1.3153055705786911, + "grad_norm": 2.484375, + "learning_rate": 0.017556831290571714, + "loss": 3.2776, + "mean_token_accuracy": 0.3773477077484131, + "num_tokens": 2487587070.0, + "step": 4864 + }, + { + "epoch": 1.3155759870200108, + "grad_norm": 2.390625, + "learning_rate": 0.017555751284390363, + "loss": 3.3201, + "mean_token_accuracy": 0.36411529779434204, + "num_tokens": 2488111160.0, + "step": 4865 + }, + { + "epoch": 1.3158464034613304, + "grad_norm": 2.96875, + "learning_rate": 0.017554671077059287, + "loss": 3.2261, + "mean_token_accuracy": 0.37779170274734497, + "num_tokens": 2488635261.0, + "step": 4866 + }, + { + "epoch": 1.31611681990265, + "grad_norm": 2.5625, + "learning_rate": 0.017553590668611625, + "loss": 3.2473, + "mean_token_accuracy": 0.3888838291168213, + "num_tokens": 2489159540.0, + "step": 4867 + }, + { + "epoch": 1.3163872363439697, + "grad_norm": 3.109375, + "learning_rate": 0.01755251005908053, + "loss": 3.4678, + "mean_token_accuracy": 0.38720953464508057, + "num_tokens": 2489661871.0, + "step": 4868 + }, + { + "epoch": 1.3166576527852893, + "grad_norm": 3.140625, + "learning_rate": 0.017551429248499152, + "loss": 3.0692, + "mean_token_accuracy": 0.3960632085800171, + "num_tokens": 2490186041.0, + "step": 4869 + }, + { + "epoch": 1.316928069226609, + "grad_norm": 3.296875, + "learning_rate": 0.017550348236900663, + "loss": 3.2054, + "mean_token_accuracy": 0.3892267644405365, + "num_tokens": 2490710270.0, + "step": 4870 + }, + { + "epoch": 1.3171984856679286, + "grad_norm": 21.625, + "learning_rate": 0.01754926702431822, + "loss": 12.3419, + "mean_token_accuracy": 0.006582840345799923, + "num_tokens": 2491234511.0, + "step": 4871 + }, + { + "epoch": 1.3174689021092483, + "grad_norm": 7.09375, + "learning_rate": 0.01754818561078501, + "loss": 3.9046, + "mean_token_accuracy": 0.2976349890232086, + "num_tokens": 2491758790.0, + "step": 4872 + }, + { + "epoch": 1.317739318550568, + "grad_norm": 2.296875, + "learning_rate": 0.01754710399633421, + "loss": 3.4271, + "mean_token_accuracy": 0.3684742748737335, + "num_tokens": 2492282963.0, + "step": 4873 + }, + { + "epoch": 1.3180097349918876, + "grad_norm": 3.484375, + "learning_rate": 0.017546022180999002, + "loss": 3.3547, + "mean_token_accuracy": 0.3741878867149353, + "num_tokens": 2492807153.0, + "step": 4874 + }, + { + "epoch": 1.3182801514332072, + "grad_norm": 2.578125, + "learning_rate": 0.017544940164812586, + "loss": 3.3303, + "mean_token_accuracy": 0.3792896568775177, + "num_tokens": 2493296199.0, + "step": 4875 + }, + { + "epoch": 1.3185505678745268, + "grad_norm": 2.515625, + "learning_rate": 0.017543857947808154, + "loss": 3.2978, + "mean_token_accuracy": 0.3730929493904114, + "num_tokens": 2493820452.0, + "step": 4876 + }, + { + "epoch": 1.3188209843158463, + "grad_norm": 2.546875, + "learning_rate": 0.017542775530018918, + "loss": 3.2854, + "mean_token_accuracy": 0.39015382528305054, + "num_tokens": 2494344737.0, + "step": 4877 + }, + { + "epoch": 1.319091400757166, + "grad_norm": 2.453125, + "learning_rate": 0.01754169291147809, + "loss": 3.3086, + "mean_token_accuracy": 0.373887836933136, + "num_tokens": 2494868908.0, + "step": 4878 + }, + { + "epoch": 1.3193618171984856, + "grad_norm": 3.15625, + "learning_rate": 0.017540610092218883, + "loss": 3.1829, + "mean_token_accuracy": 0.4030199646949768, + "num_tokens": 2495376047.0, + "step": 4879 + }, + { + "epoch": 1.3196322336398052, + "grad_norm": 3.796875, + "learning_rate": 0.017539527072274522, + "loss": 3.454, + "mean_token_accuracy": 0.3556607961654663, + "num_tokens": 2495900256.0, + "step": 4880 + }, + { + "epoch": 1.3199026500811248, + "grad_norm": 3.6875, + "learning_rate": 0.01753844385167824, + "loss": 3.4569, + "mean_token_accuracy": 0.37899020314216614, + "num_tokens": 2496424459.0, + "step": 4881 + }, + { + "epoch": 1.3201730665224445, + "grad_norm": 3.546875, + "learning_rate": 0.017537360430463275, + "loss": 3.2258, + "mean_token_accuracy": 0.3549318015575409, + "num_tokens": 2496948724.0, + "step": 4882 + }, + { + "epoch": 1.3204434829637641, + "grad_norm": 2.140625, + "learning_rate": 0.017536276808662866, + "loss": 3.3524, + "mean_token_accuracy": 0.39174342155456543, + "num_tokens": 2497472979.0, + "step": 4883 + }, + { + "epoch": 1.3207138994050838, + "grad_norm": 2.890625, + "learning_rate": 0.01753519298631026, + "loss": 3.3476, + "mean_token_accuracy": 0.3787468671798706, + "num_tokens": 2497959791.0, + "step": 4884 + }, + { + "epoch": 1.3209843158464034, + "grad_norm": 2.96875, + "learning_rate": 0.01753410896343872, + "loss": 3.2308, + "mean_token_accuracy": 0.39695096015930176, + "num_tokens": 2498484056.0, + "step": 4885 + }, + { + "epoch": 1.321254732287723, + "grad_norm": 2.8125, + "learning_rate": 0.017533024740081497, + "loss": 3.272, + "mean_token_accuracy": 0.39355483651161194, + "num_tokens": 2498936667.0, + "step": 4886 + }, + { + "epoch": 1.3215251487290427, + "grad_norm": 2.625, + "learning_rate": 0.017531940316271862, + "loss": 3.1417, + "mean_token_accuracy": 0.3947294056415558, + "num_tokens": 2499460838.0, + "step": 4887 + }, + { + "epoch": 1.3217955651703623, + "grad_norm": 3.28125, + "learning_rate": 0.017530855692043094, + "loss": 3.3411, + "mean_token_accuracy": 0.37896251678466797, + "num_tokens": 2499984194.0, + "step": 4888 + }, + { + "epoch": 1.322065981611682, + "grad_norm": 2.578125, + "learning_rate": 0.01752977086742846, + "loss": 2.9926, + "mean_token_accuracy": 0.39752358198165894, + "num_tokens": 2500508441.0, + "step": 4889 + }, + { + "epoch": 1.3223363980530016, + "grad_norm": 2.890625, + "learning_rate": 0.01752868584246126, + "loss": 3.2443, + "mean_token_accuracy": 0.3922310471534729, + "num_tokens": 2501003578.0, + "step": 4890 + }, + { + "epoch": 1.3226068144943213, + "grad_norm": 6.9375, + "learning_rate": 0.017527600617174774, + "loss": 10.0237, + "mean_token_accuracy": 0.00046716598444618285, + "num_tokens": 2501527848.0, + "step": 4891 + }, + { + "epoch": 1.322877230935641, + "grad_norm": 7.4375, + "learning_rate": 0.017526515191602306, + "loss": 3.8852, + "mean_token_accuracy": 0.33735406398773193, + "num_tokens": 2502047226.0, + "step": 4892 + }, + { + "epoch": 1.3231476473769606, + "grad_norm": 2.421875, + "learning_rate": 0.017525429565777156, + "loss": 3.4335, + "mean_token_accuracy": 0.35024476051330566, + "num_tokens": 2502571396.0, + "step": 4893 + }, + { + "epoch": 1.3234180638182802, + "grad_norm": 3.953125, + "learning_rate": 0.01752434373973264, + "loss": 3.1588, + "mean_token_accuracy": 0.3912854790687561, + "num_tokens": 2503078485.0, + "step": 4894 + }, + { + "epoch": 1.3236884802595998, + "grad_norm": 3.171875, + "learning_rate": 0.017523257713502074, + "loss": 3.376, + "mean_token_accuracy": 0.39635682106018066, + "num_tokens": 2503602529.0, + "step": 4895 + }, + { + "epoch": 1.3239588967009195, + "grad_norm": 5.75, + "learning_rate": 0.017522171487118772, + "loss": 3.3605, + "mean_token_accuracy": 0.3868776857852936, + "num_tokens": 2504126694.0, + "step": 4896 + }, + { + "epoch": 1.3242293131422391, + "grad_norm": 2.015625, + "learning_rate": 0.017521085060616073, + "loss": 3.2541, + "mean_token_accuracy": 0.39719128608703613, + "num_tokens": 2504643247.0, + "step": 4897 + }, + { + "epoch": 1.3244997295835588, + "grad_norm": 2.9375, + "learning_rate": 0.017519998434027306, + "loss": 3.2988, + "mean_token_accuracy": 0.38428232073783875, + "num_tokens": 2505154874.0, + "step": 4898 + }, + { + "epoch": 1.3247701460248784, + "grad_norm": 2.5, + "learning_rate": 0.01751891160738581, + "loss": 3.291, + "mean_token_accuracy": 0.3749633729457855, + "num_tokens": 2505673553.0, + "step": 4899 + }, + { + "epoch": 1.325040562466198, + "grad_norm": 3.5, + "learning_rate": 0.017517824580724938, + "loss": 3.4109, + "mean_token_accuracy": 0.3767700791358948, + "num_tokens": 2506197801.0, + "step": 4900 + }, + { + "epoch": 1.3253109789075177, + "grad_norm": 3.984375, + "learning_rate": 0.01751673735407804, + "loss": 3.2947, + "mean_token_accuracy": 0.3761923909187317, + "num_tokens": 2506697403.0, + "step": 4901 + }, + { + "epoch": 1.3255813953488373, + "grad_norm": 3.0625, + "learning_rate": 0.017515649927478474, + "loss": 3.5248, + "mean_token_accuracy": 0.37908250093460083, + "num_tokens": 2507212739.0, + "step": 4902 + }, + { + "epoch": 1.3258518117901568, + "grad_norm": 4.71875, + "learning_rate": 0.017514562300959605, + "loss": 3.4935, + "mean_token_accuracy": 0.35714203119277954, + "num_tokens": 2507736843.0, + "step": 4903 + }, + { + "epoch": 1.3261222282314764, + "grad_norm": 2.390625, + "learning_rate": 0.01751347447455481, + "loss": 3.1152, + "mean_token_accuracy": 0.3899589776992798, + "num_tokens": 2508261096.0, + "step": 4904 + }, + { + "epoch": 1.326392644672796, + "grad_norm": 3.375, + "learning_rate": 0.017512386448297463, + "loss": 3.4345, + "mean_token_accuracy": 0.3569387197494507, + "num_tokens": 2508785338.0, + "step": 4905 + }, + { + "epoch": 1.3266630611141157, + "grad_norm": 2.484375, + "learning_rate": 0.017511298222220948, + "loss": 3.2173, + "mean_token_accuracy": 0.39618971943855286, + "num_tokens": 2509309474.0, + "step": 4906 + }, + { + "epoch": 1.3269334775554353, + "grad_norm": 3.03125, + "learning_rate": 0.01751020979635865, + "loss": 3.4293, + "mean_token_accuracy": 0.36362120509147644, + "num_tokens": 2509827149.0, + "step": 4907 + }, + { + "epoch": 1.327203893996755, + "grad_norm": 3.5625, + "learning_rate": 0.017509121170743977, + "loss": 2.9702, + "mean_token_accuracy": 0.393230140209198, + "num_tokens": 2510351261.0, + "step": 4908 + }, + { + "epoch": 1.3274743104380746, + "grad_norm": 2.171875, + "learning_rate": 0.01750803234541032, + "loss": 3.2644, + "mean_token_accuracy": 0.38308724761009216, + "num_tokens": 2510875481.0, + "step": 4909 + }, + { + "epoch": 1.3277447268793943, + "grad_norm": 3.125, + "learning_rate": 0.017506943320391096, + "loss": 3.5106, + "mean_token_accuracy": 0.36329448223114014, + "num_tokens": 2511399740.0, + "step": 4910 + }, + { + "epoch": 1.328015143320714, + "grad_norm": 20.5, + "learning_rate": 0.017505854095719715, + "loss": 10.3333, + "mean_token_accuracy": 0.01133880577981472, + "num_tokens": 2511923859.0, + "step": 4911 + }, + { + "epoch": 1.3282855597620336, + "grad_norm": 6.78125, + "learning_rate": 0.017504764671429595, + "loss": 3.7679, + "mean_token_accuracy": 0.328421026468277, + "num_tokens": 2512448057.0, + "step": 4912 + }, + { + "epoch": 1.3285559762033532, + "grad_norm": 1.953125, + "learning_rate": 0.01750367504755417, + "loss": 3.3132, + "mean_token_accuracy": 0.391304075717926, + "num_tokens": 2512920998.0, + "step": 4913 + }, + { + "epoch": 1.3288263926446728, + "grad_norm": 2.640625, + "learning_rate": 0.01750258522412686, + "loss": 3.3626, + "mean_token_accuracy": 0.37262898683547974, + "num_tokens": 2513445192.0, + "step": 4914 + }, + { + "epoch": 1.3290968090859925, + "grad_norm": 3.125, + "learning_rate": 0.017501495201181123, + "loss": 3.3803, + "mean_token_accuracy": 0.3754348158836365, + "num_tokens": 2513908298.0, + "step": 4915 + }, + { + "epoch": 1.3293672255273121, + "grad_norm": 3.515625, + "learning_rate": 0.017500404978750383, + "loss": 3.3431, + "mean_token_accuracy": 0.4029353857040405, + "num_tokens": 2514382618.0, + "step": 4916 + }, + { + "epoch": 1.3296376419686318, + "grad_norm": 3.265625, + "learning_rate": 0.01749931455686811, + "loss": 3.3941, + "mean_token_accuracy": 0.3503761291503906, + "num_tokens": 2514906812.0, + "step": 4917 + }, + { + "epoch": 1.3299080584099512, + "grad_norm": 2.46875, + "learning_rate": 0.01749822393556775, + "loss": 3.2684, + "mean_token_accuracy": 0.3908323049545288, + "num_tokens": 2515431084.0, + "step": 4918 + }, + { + "epoch": 1.3301784748512708, + "grad_norm": 2.71875, + "learning_rate": 0.017497133114882768, + "loss": 3.2407, + "mean_token_accuracy": 0.36611542105674744, + "num_tokens": 2515955271.0, + "step": 4919 + }, + { + "epoch": 1.3304488912925905, + "grad_norm": 2.984375, + "learning_rate": 0.017496042094846637, + "loss": 3.4256, + "mean_token_accuracy": 0.3680223822593689, + "num_tokens": 2516479432.0, + "step": 4920 + }, + { + "epoch": 1.3307193077339101, + "grad_norm": 3.671875, + "learning_rate": 0.01749495087549283, + "loss": 3.4191, + "mean_token_accuracy": 0.38494282960891724, + "num_tokens": 2517003625.0, + "step": 4921 + }, + { + "epoch": 1.3309897241752298, + "grad_norm": 2.859375, + "learning_rate": 0.017493859456854832, + "loss": 3.3527, + "mean_token_accuracy": 0.4346138834953308, + "num_tokens": 2517453770.0, + "step": 4922 + }, + { + "epoch": 1.3312601406165494, + "grad_norm": 4.34375, + "learning_rate": 0.017492767838966125, + "loss": 3.6086, + "mean_token_accuracy": 0.36261218786239624, + "num_tokens": 2517978048.0, + "step": 4923 + }, + { + "epoch": 1.331530557057869, + "grad_norm": 3.0, + "learning_rate": 0.01749167602186021, + "loss": 3.119, + "mean_token_accuracy": 0.39736640453338623, + "num_tokens": 2518502310.0, + "step": 4924 + }, + { + "epoch": 1.3318009734991887, + "grad_norm": 3.1875, + "learning_rate": 0.017490584005570582, + "loss": 3.5179, + "mean_token_accuracy": 0.3750889003276825, + "num_tokens": 2518985450.0, + "step": 4925 + }, + { + "epoch": 1.3320713899405083, + "grad_norm": 3.15625, + "learning_rate": 0.017489491790130752, + "loss": 3.3031, + "mean_token_accuracy": 0.39054304361343384, + "num_tokens": 2519497736.0, + "step": 4926 + }, + { + "epoch": 1.332341806381828, + "grad_norm": 3.3125, + "learning_rate": 0.017488399375574226, + "loss": 3.2253, + "mean_token_accuracy": 0.3773902654647827, + "num_tokens": 2520021973.0, + "step": 4927 + }, + { + "epoch": 1.3326122228231476, + "grad_norm": 2.796875, + "learning_rate": 0.017487306761934527, + "loss": 3.3529, + "mean_token_accuracy": 0.40044304728507996, + "num_tokens": 2520546151.0, + "step": 4928 + }, + { + "epoch": 1.3328826392644673, + "grad_norm": 3.703125, + "learning_rate": 0.017486213949245183, + "loss": 3.0107, + "mean_token_accuracy": 0.38337087631225586, + "num_tokens": 2521025171.0, + "step": 4929 + }, + { + "epoch": 1.333153055705787, + "grad_norm": 2.3125, + "learning_rate": 0.017485120937539712, + "loss": 3.2033, + "mean_token_accuracy": 0.4165497124195099, + "num_tokens": 2521499900.0, + "step": 4930 + }, + { + "epoch": 1.3334234721471065, + "grad_norm": 13.4375, + "learning_rate": 0.01748402772685167, + "loss": 10.989, + "mean_token_accuracy": 2.768304057099158e-06, + "num_tokens": 2522024137.0, + "step": 4931 + }, + { + "epoch": 1.3336938885884262, + "grad_norm": 8.0, + "learning_rate": 0.01748293431721458, + "loss": 4.0644, + "mean_token_accuracy": 0.2863488793373108, + "num_tokens": 2522491535.0, + "step": 4932 + }, + { + "epoch": 1.3339643050297458, + "grad_norm": 2.609375, + "learning_rate": 0.017481840708662, + "loss": 3.309, + "mean_token_accuracy": 0.3647800087928772, + "num_tokens": 2523015688.0, + "step": 4933 + }, + { + "epoch": 1.3342347214710655, + "grad_norm": 2.671875, + "learning_rate": 0.01748074690122749, + "loss": 3.4957, + "mean_token_accuracy": 0.371839314699173, + "num_tokens": 2523539708.0, + "step": 4934 + }, + { + "epoch": 1.3345051379123851, + "grad_norm": 2.609375, + "learning_rate": 0.017479652894944603, + "loss": 3.3586, + "mean_token_accuracy": 0.3698738217353821, + "num_tokens": 2524063953.0, + "step": 4935 + }, + { + "epoch": 1.3347755543537048, + "grad_norm": 2.484375, + "learning_rate": 0.01747855868984691, + "loss": 3.1892, + "mean_token_accuracy": 0.39027076959609985, + "num_tokens": 2524551546.0, + "step": 4936 + }, + { + "epoch": 1.3350459707950244, + "grad_norm": 2.625, + "learning_rate": 0.017477464285967987, + "loss": 3.1082, + "mean_token_accuracy": 0.392936646938324, + "num_tokens": 2525075732.0, + "step": 4937 + }, + { + "epoch": 1.335316387236344, + "grad_norm": 2.546875, + "learning_rate": 0.01747636968334141, + "loss": 3.1636, + "mean_token_accuracy": 0.38886648416519165, + "num_tokens": 2525599962.0, + "step": 4938 + }, + { + "epoch": 1.3355868036776637, + "grad_norm": 2.734375, + "learning_rate": 0.017475274882000763, + "loss": 3.4204, + "mean_token_accuracy": 0.371009886264801, + "num_tokens": 2526124100.0, + "step": 4939 + }, + { + "epoch": 1.3358572201189833, + "grad_norm": 3.484375, + "learning_rate": 0.01747417988197964, + "loss": 3.3462, + "mean_token_accuracy": 0.394237220287323, + "num_tokens": 2526648173.0, + "step": 4940 + }, + { + "epoch": 1.336127636560303, + "grad_norm": 2.5, + "learning_rate": 0.017473084683311643, + "loss": 3.1648, + "mean_token_accuracy": 0.4091605544090271, + "num_tokens": 2527172415.0, + "step": 4941 + }, + { + "epoch": 1.3363980530016226, + "grad_norm": 3.296875, + "learning_rate": 0.01747198928603037, + "loss": 3.258, + "mean_token_accuracy": 0.374664843082428, + "num_tokens": 2527660672.0, + "step": 4942 + }, + { + "epoch": 1.3366684694429423, + "grad_norm": 3.390625, + "learning_rate": 0.01747089369016943, + "loss": 3.1256, + "mean_token_accuracy": 0.3748396039009094, + "num_tokens": 2528142614.0, + "step": 4943 + }, + { + "epoch": 1.3369388858842617, + "grad_norm": 2.609375, + "learning_rate": 0.017469797895762442, + "loss": 3.3906, + "mean_token_accuracy": 0.390780508518219, + "num_tokens": 2528617464.0, + "step": 4944 + }, + { + "epoch": 1.3372093023255813, + "grad_norm": 3.09375, + "learning_rate": 0.017468701902843033, + "loss": 3.2195, + "mean_token_accuracy": 0.38938936591148376, + "num_tokens": 2529141584.0, + "step": 4945 + }, + { + "epoch": 1.337479718766901, + "grad_norm": 2.578125, + "learning_rate": 0.017467605711444824, + "loss": 3.2008, + "mean_token_accuracy": 0.3947729468345642, + "num_tokens": 2529640672.0, + "step": 4946 + }, + { + "epoch": 1.3377501352082206, + "grad_norm": 3.71875, + "learning_rate": 0.01746650932160145, + "loss": 3.4556, + "mean_token_accuracy": 0.37499886751174927, + "num_tokens": 2530160911.0, + "step": 4947 + }, + { + "epoch": 1.3380205516495403, + "grad_norm": 3.671875, + "learning_rate": 0.017465412733346557, + "loss": 3.3605, + "mean_token_accuracy": 0.36784881353378296, + "num_tokens": 2530685063.0, + "step": 4948 + }, + { + "epoch": 1.33829096809086, + "grad_norm": 2.15625, + "learning_rate": 0.01746431594671378, + "loss": 3.2183, + "mean_token_accuracy": 0.3677518963813782, + "num_tokens": 2531193417.0, + "step": 4949 + }, + { + "epoch": 1.3385613845321795, + "grad_norm": 2.65625, + "learning_rate": 0.01746321896173679, + "loss": 3.4455, + "mean_token_accuracy": 0.37190937995910645, + "num_tokens": 2531717656.0, + "step": 4950 + }, + { + "epoch": 1.3388318009734992, + "grad_norm": 14.625, + "learning_rate": 0.01746212177844923, + "loss": 9.7271, + "mean_token_accuracy": 0.015564600005745888, + "num_tokens": 2532225139.0, + "step": 4951 + }, + { + "epoch": 1.3391022174148188, + "grad_norm": 6.125, + "learning_rate": 0.017461024396884767, + "loss": 3.3907, + "mean_token_accuracy": 0.3800164461135864, + "num_tokens": 2532749347.0, + "step": 4952 + }, + { + "epoch": 1.3393726338561385, + "grad_norm": 2.421875, + "learning_rate": 0.017459926817077077, + "loss": 3.3575, + "mean_token_accuracy": 0.39366596937179565, + "num_tokens": 2533273496.0, + "step": 4953 + }, + { + "epoch": 1.3396430502974581, + "grad_norm": 4.03125, + "learning_rate": 0.017458829039059837, + "loss": 3.2732, + "mean_token_accuracy": 0.3605104088783264, + "num_tokens": 2533797589.0, + "step": 4954 + }, + { + "epoch": 1.3399134667387778, + "grad_norm": 3.0625, + "learning_rate": 0.017457731062866726, + "loss": 3.3568, + "mean_token_accuracy": 0.3707055449485779, + "num_tokens": 2534321863.0, + "step": 4955 + }, + { + "epoch": 1.3401838831800974, + "grad_norm": 2.578125, + "learning_rate": 0.017456632888531436, + "loss": 3.279, + "mean_token_accuracy": 0.361366868019104, + "num_tokens": 2534846107.0, + "step": 4956 + }, + { + "epoch": 1.340454299621417, + "grad_norm": 2.890625, + "learning_rate": 0.01745553451608766, + "loss": 3.3817, + "mean_token_accuracy": 0.3763299584388733, + "num_tokens": 2535325960.0, + "step": 4957 + }, + { + "epoch": 1.3407247160627367, + "grad_norm": 2.84375, + "learning_rate": 0.017454435945569103, + "loss": 3.4568, + "mean_token_accuracy": 0.34518083930015564, + "num_tokens": 2535850119.0, + "step": 4958 + }, + { + "epoch": 1.340995132504056, + "grad_norm": 2.71875, + "learning_rate": 0.017453337177009465, + "loss": 3.3868, + "mean_token_accuracy": 0.38315802812576294, + "num_tokens": 2536374378.0, + "step": 4959 + }, + { + "epoch": 1.3412655489453758, + "grad_norm": 2.875, + "learning_rate": 0.01745223821044247, + "loss": 3.4135, + "mean_token_accuracy": 0.33939114212989807, + "num_tokens": 2536898435.0, + "step": 4960 + }, + { + "epoch": 1.3415359653866954, + "grad_norm": 2.890625, + "learning_rate": 0.01745113904590183, + "loss": 3.4612, + "mean_token_accuracy": 0.35476964712142944, + "num_tokens": 2537415642.0, + "step": 4961 + }, + { + "epoch": 1.341806381828015, + "grad_norm": 2.78125, + "learning_rate": 0.01745003968342127, + "loss": 3.3372, + "mean_token_accuracy": 0.3605303168296814, + "num_tokens": 2537880864.0, + "step": 4962 + }, + { + "epoch": 1.3420767982693347, + "grad_norm": 2.640625, + "learning_rate": 0.01744894012303453, + "loss": 3.5089, + "mean_token_accuracy": 0.38407230377197266, + "num_tokens": 2538365049.0, + "step": 4963 + }, + { + "epoch": 1.3423472147106543, + "grad_norm": 3.375, + "learning_rate": 0.017447840364775338, + "loss": 3.543, + "mean_token_accuracy": 0.3663215935230255, + "num_tokens": 2538889325.0, + "step": 4964 + }, + { + "epoch": 1.342617631151974, + "grad_norm": 2.875, + "learning_rate": 0.017446740408677438, + "loss": 3.2573, + "mean_token_accuracy": 0.3700483441352844, + "num_tokens": 2539413604.0, + "step": 4965 + }, + { + "epoch": 1.3428880475932936, + "grad_norm": 3.046875, + "learning_rate": 0.017445640254774592, + "loss": 3.2035, + "mean_token_accuracy": 0.39334508776664734, + "num_tokens": 2539937844.0, + "step": 4966 + }, + { + "epoch": 1.3431584640346133, + "grad_norm": 3.125, + "learning_rate": 0.01744453990310054, + "loss": 3.3039, + "mean_token_accuracy": 0.3682275414466858, + "num_tokens": 2540462118.0, + "step": 4967 + }, + { + "epoch": 1.343428880475933, + "grad_norm": 16.75, + "learning_rate": 0.017443439353689055, + "loss": 3.1173, + "mean_token_accuracy": 0.44042307138442993, + "num_tokens": 2540986132.0, + "step": 4968 + }, + { + "epoch": 1.3436992969172525, + "grad_norm": 2.875, + "learning_rate": 0.0174423386065739, + "loss": 3.3022, + "mean_token_accuracy": 0.388779878616333, + "num_tokens": 2541467570.0, + "step": 4969 + }, + { + "epoch": 1.3439697133585722, + "grad_norm": 1.8203125, + "learning_rate": 0.017441237661788855, + "loss": 3.1847, + "mean_token_accuracy": 0.39906930923461914, + "num_tokens": 2541894967.0, + "step": 4970 + }, + { + "epoch": 1.3442401297998918, + "grad_norm": 17.625, + "learning_rate": 0.017440136519367697, + "loss": 13.6857, + "mean_token_accuracy": 0.011324816383421421, + "num_tokens": 2542419181.0, + "step": 4971 + }, + { + "epoch": 1.3445105462412115, + "grad_norm": 7.1875, + "learning_rate": 0.0174390351793442, + "loss": 3.904, + "mean_token_accuracy": 0.3118392527103424, + "num_tokens": 2542943373.0, + "step": 4972 + }, + { + "epoch": 1.3447809626825311, + "grad_norm": 2.421875, + "learning_rate": 0.01743793364175218, + "loss": 3.5243, + "mean_token_accuracy": 0.36257490515708923, + "num_tokens": 2543467595.0, + "step": 4973 + }, + { + "epoch": 1.3450513791238508, + "grad_norm": 3.453125, + "learning_rate": 0.017436831906625417, + "loss": 3.4546, + "mean_token_accuracy": 0.35012489557266235, + "num_tokens": 2543991793.0, + "step": 4974 + }, + { + "epoch": 1.3453217955651704, + "grad_norm": 2.796875, + "learning_rate": 0.017435729973997723, + "loss": 3.1682, + "mean_token_accuracy": 0.3957253694534302, + "num_tokens": 2544515976.0, + "step": 4975 + }, + { + "epoch": 1.34559221200649, + "grad_norm": 2.96875, + "learning_rate": 0.017434627843902907, + "loss": 3.3803, + "mean_token_accuracy": 0.3839380145072937, + "num_tokens": 2544980339.0, + "step": 4976 + }, + { + "epoch": 1.3458626284478097, + "grad_norm": 2.890625, + "learning_rate": 0.01743352551637479, + "loss": 3.3581, + "mean_token_accuracy": 0.39471256732940674, + "num_tokens": 2545504521.0, + "step": 4977 + }, + { + "epoch": 1.3461330448891293, + "grad_norm": 3.03125, + "learning_rate": 0.017432422991447184, + "loss": 3.2202, + "mean_token_accuracy": 0.37574562430381775, + "num_tokens": 2546028795.0, + "step": 4978 + }, + { + "epoch": 1.346403461330449, + "grad_norm": 3.34375, + "learning_rate": 0.017431320269153925, + "loss": 3.2204, + "mean_token_accuracy": 0.38984414935112, + "num_tokens": 2546552712.0, + "step": 4979 + }, + { + "epoch": 1.3466738777717686, + "grad_norm": 2.5625, + "learning_rate": 0.01743021734952885, + "loss": 3.1798, + "mean_token_accuracy": 0.3889608383178711, + "num_tokens": 2547076993.0, + "step": 4980 + }, + { + "epoch": 1.3469442942130883, + "grad_norm": 3.109375, + "learning_rate": 0.017429114232605795, + "loss": 3.5888, + "mean_token_accuracy": 0.35442325472831726, + "num_tokens": 2547591386.0, + "step": 4981 + }, + { + "epoch": 1.347214710654408, + "grad_norm": 2.90625, + "learning_rate": 0.01742801091841861, + "loss": 3.4184, + "mean_token_accuracy": 0.38306596875190735, + "num_tokens": 2548115565.0, + "step": 4982 + }, + { + "epoch": 1.3474851270957275, + "grad_norm": 3.5, + "learning_rate": 0.017426907407001146, + "loss": 3.3677, + "mean_token_accuracy": 0.3841559886932373, + "num_tokens": 2548639644.0, + "step": 4983 + }, + { + "epoch": 1.3477555435370472, + "grad_norm": 2.625, + "learning_rate": 0.01742580369838726, + "loss": 3.1622, + "mean_token_accuracy": 0.4027040898799896, + "num_tokens": 2549128370.0, + "step": 4984 + }, + { + "epoch": 1.3480259599783666, + "grad_norm": 3.203125, + "learning_rate": 0.017424699792610823, + "loss": 3.5019, + "mean_token_accuracy": 0.36039113998413086, + "num_tokens": 2549652648.0, + "step": 4985 + }, + { + "epoch": 1.3482963764196862, + "grad_norm": 2.859375, + "learning_rate": 0.0174235956897057, + "loss": 3.3864, + "mean_token_accuracy": 0.3885677456855774, + "num_tokens": 2550159750.0, + "step": 4986 + }, + { + "epoch": 1.348566792861006, + "grad_norm": 3.015625, + "learning_rate": 0.017422491389705773, + "loss": 3.4348, + "mean_token_accuracy": 0.3856317400932312, + "num_tokens": 2550684029.0, + "step": 4987 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 2.921875, + "learning_rate": 0.01742138689264492, + "loss": 3.2942, + "mean_token_accuracy": 0.3995109796524048, + "num_tokens": 2551145579.0, + "step": 4988 + }, + { + "epoch": 1.3491076257436452, + "grad_norm": 3.3125, + "learning_rate": 0.017420282198557038, + "loss": 3.4769, + "mean_token_accuracy": 0.36641305685043335, + "num_tokens": 2551643112.0, + "step": 4989 + }, + { + "epoch": 1.3493780421849648, + "grad_norm": 3.203125, + "learning_rate": 0.017419177307476007, + "loss": 3.3924, + "mean_token_accuracy": 0.3672167658805847, + "num_tokens": 2552167318.0, + "step": 4990 + }, + { + "epoch": 1.3496484586262845, + "grad_norm": 2.671875, + "learning_rate": 0.017418072219435747, + "loss": 10.676, + "mean_token_accuracy": 0.0, + "num_tokens": 2552662286.0, + "step": 4991 + }, + { + "epoch": 1.349918875067604, + "grad_norm": 5.4375, + "learning_rate": 0.017416966934470157, + "loss": 3.358, + "mean_token_accuracy": 0.330081045627594, + "num_tokens": 2553186501.0, + "step": 4992 + }, + { + "epoch": 1.3501892915089238, + "grad_norm": 2.75, + "learning_rate": 0.017415861452613146, + "loss": 3.4241, + "mean_token_accuracy": 0.38067370653152466, + "num_tokens": 2553692136.0, + "step": 4993 + }, + { + "epoch": 1.3504597079502434, + "grad_norm": 3.421875, + "learning_rate": 0.01741475577389864, + "loss": 3.4121, + "mean_token_accuracy": 0.35914143919944763, + "num_tokens": 2554216272.0, + "step": 4994 + }, + { + "epoch": 1.350730124391563, + "grad_norm": 2.9375, + "learning_rate": 0.017413649898360566, + "loss": 3.3419, + "mean_token_accuracy": 0.39551877975463867, + "num_tokens": 2554647442.0, + "step": 4995 + }, + { + "epoch": 1.3510005408328827, + "grad_norm": 2.734375, + "learning_rate": 0.01741254382603285, + "loss": 3.1407, + "mean_token_accuracy": 0.39035671949386597, + "num_tokens": 2555171714.0, + "step": 4996 + }, + { + "epoch": 1.3512709572742023, + "grad_norm": 2.859375, + "learning_rate": 0.01741143755694943, + "loss": 3.3915, + "mean_token_accuracy": 0.3705179691314697, + "num_tokens": 2555695758.0, + "step": 4997 + }, + { + "epoch": 1.351541373715522, + "grad_norm": 3.109375, + "learning_rate": 0.01741033109114425, + "loss": 3.4486, + "mean_token_accuracy": 0.37876594066619873, + "num_tokens": 2556165538.0, + "step": 4998 + }, + { + "epoch": 1.3518117901568416, + "grad_norm": 3.453125, + "learning_rate": 0.01740922442865126, + "loss": 3.4364, + "mean_token_accuracy": 0.38343366980552673, + "num_tokens": 2556689753.0, + "step": 4999 + }, + { + "epoch": 1.3520822065981613, + "grad_norm": 2.671875, + "learning_rate": 0.017408117569504417, + "loss": 3.2665, + "mean_token_accuracy": 0.38777029514312744, + "num_tokens": 2557213946.0, + "step": 5000 + }, + { + "epoch": 1.3523526230394807, + "grad_norm": 3.5625, + "learning_rate": 0.017407010513737685, + "loss": 3.3832, + "mean_token_accuracy": 0.36609840393066406, + "num_tokens": 2557738117.0, + "step": 5001 + }, + { + "epoch": 1.3526230394808003, + "grad_norm": 2.953125, + "learning_rate": 0.01740590326138502, + "loss": 3.3257, + "mean_token_accuracy": 0.38026857376098633, + "num_tokens": 2558262289.0, + "step": 5002 + }, + { + "epoch": 1.35289345592212, + "grad_norm": 3.875, + "learning_rate": 0.017404795812480413, + "loss": 3.3869, + "mean_token_accuracy": 0.37090638279914856, + "num_tokens": 2558786557.0, + "step": 5003 + }, + { + "epoch": 1.3531638723634396, + "grad_norm": 2.5625, + "learning_rate": 0.017403688167057837, + "loss": 3.3112, + "mean_token_accuracy": 0.3779306411743164, + "num_tokens": 2559310823.0, + "step": 5004 + }, + { + "epoch": 1.3534342888047592, + "grad_norm": 3.15625, + "learning_rate": 0.017402580325151268, + "loss": 3.4032, + "mean_token_accuracy": 0.3626571297645569, + "num_tokens": 2559777391.0, + "step": 5005 + }, + { + "epoch": 1.3537047052460789, + "grad_norm": 2.78125, + "learning_rate": 0.01740147228679471, + "loss": 3.2351, + "mean_token_accuracy": 0.38499316573143005, + "num_tokens": 2560301579.0, + "step": 5006 + }, + { + "epoch": 1.3539751216873985, + "grad_norm": 3.21875, + "learning_rate": 0.01740036405202216, + "loss": 3.272, + "mean_token_accuracy": 0.37283968925476074, + "num_tokens": 2560825796.0, + "step": 5007 + }, + { + "epoch": 1.3542455381287182, + "grad_norm": 2.53125, + "learning_rate": 0.01739925562086761, + "loss": 3.2722, + "mean_token_accuracy": 0.366573303937912, + "num_tokens": 2561350045.0, + "step": 5008 + }, + { + "epoch": 1.3545159545700378, + "grad_norm": 2.90625, + "learning_rate": 0.017398146993365088, + "loss": 3.081, + "mean_token_accuracy": 0.38994935154914856, + "num_tokens": 2561874296.0, + "step": 5009 + }, + { + "epoch": 1.3547863710113575, + "grad_norm": 2.765625, + "learning_rate": 0.017397038169548594, + "loss": 3.2644, + "mean_token_accuracy": 0.36998578906059265, + "num_tokens": 2562398441.0, + "step": 5010 + }, + { + "epoch": 1.355056787452677, + "grad_norm": 13.6875, + "learning_rate": 0.01739592914945216, + "loss": 12.6936, + "mean_token_accuracy": 0.01110551506280899, + "num_tokens": 2562922712.0, + "step": 5011 + }, + { + "epoch": 1.3553272038939967, + "grad_norm": 8.125, + "learning_rate": 0.01739481993310981, + "loss": 3.869, + "mean_token_accuracy": 0.2947724461555481, + "num_tokens": 2563446926.0, + "step": 5012 + }, + { + "epoch": 1.3555976203353164, + "grad_norm": 2.453125, + "learning_rate": 0.017393710520555574, + "loss": 3.4729, + "mean_token_accuracy": 0.3652299642562866, + "num_tokens": 2563964836.0, + "step": 5013 + }, + { + "epoch": 1.355868036776636, + "grad_norm": 2.234375, + "learning_rate": 0.017392600911823503, + "loss": 3.089, + "mean_token_accuracy": 0.3995199501514435, + "num_tokens": 2564489007.0, + "step": 5014 + }, + { + "epoch": 1.3561384532179557, + "grad_norm": 3.1875, + "learning_rate": 0.017391491106947633, + "loss": 3.3463, + "mean_token_accuracy": 0.36046314239501953, + "num_tokens": 2565013148.0, + "step": 5015 + }, + { + "epoch": 1.3564088696592753, + "grad_norm": 2.5, + "learning_rate": 0.01739038110596202, + "loss": 3.1548, + "mean_token_accuracy": 0.39829716086387634, + "num_tokens": 2565537395.0, + "step": 5016 + }, + { + "epoch": 1.356679286100595, + "grad_norm": 2.84375, + "learning_rate": 0.017389270908900724, + "loss": 3.2235, + "mean_token_accuracy": 0.3700704872608185, + "num_tokens": 2566061611.0, + "step": 5017 + }, + { + "epoch": 1.3569497025419146, + "grad_norm": 2.671875, + "learning_rate": 0.017388160515797804, + "loss": 3.2136, + "mean_token_accuracy": 0.3876889944076538, + "num_tokens": 2566585635.0, + "step": 5018 + }, + { + "epoch": 1.3572201189832342, + "grad_norm": 2.734375, + "learning_rate": 0.017387049926687336, + "loss": 3.4446, + "mean_token_accuracy": 0.3688647747039795, + "num_tokens": 2567109917.0, + "step": 5019 + }, + { + "epoch": 1.357490535424554, + "grad_norm": 3.390625, + "learning_rate": 0.01738593914160339, + "loss": 3.4552, + "mean_token_accuracy": 0.391364723443985, + "num_tokens": 2567592744.0, + "step": 5020 + }, + { + "epoch": 1.3577609518658735, + "grad_norm": 2.59375, + "learning_rate": 0.01738482816058005, + "loss": 3.2337, + "mean_token_accuracy": 0.38191479444503784, + "num_tokens": 2568116756.0, + "step": 5021 + }, + { + "epoch": 1.3580313683071932, + "grad_norm": 2.8125, + "learning_rate": 0.01738371698365141, + "loss": 3.306, + "mean_token_accuracy": 0.36972126364707947, + "num_tokens": 2568641024.0, + "step": 5022 + }, + { + "epoch": 1.3583017847485128, + "grad_norm": 3.21875, + "learning_rate": 0.01738260561085156, + "loss": 3.0861, + "mean_token_accuracy": 0.3858376145362854, + "num_tokens": 2569165292.0, + "step": 5023 + }, + { + "epoch": 1.3585722011898325, + "grad_norm": 2.71875, + "learning_rate": 0.017381494042214597, + "loss": 3.2883, + "mean_token_accuracy": 0.36771053075790405, + "num_tokens": 2569689465.0, + "step": 5024 + }, + { + "epoch": 1.358842617631152, + "grad_norm": 3.453125, + "learning_rate": 0.01738038227777463, + "loss": 3.5162, + "mean_token_accuracy": 0.36992040276527405, + "num_tokens": 2570213667.0, + "step": 5025 + }, + { + "epoch": 1.3591130340724715, + "grad_norm": 3.46875, + "learning_rate": 0.01737927031756577, + "loss": 2.6855, + "mean_token_accuracy": 0.46862149238586426, + "num_tokens": 2570737829.0, + "step": 5026 + }, + { + "epoch": 1.3593834505137912, + "grad_norm": 2.359375, + "learning_rate": 0.01737815816162214, + "loss": 3.4113, + "mean_token_accuracy": 0.3894771635532379, + "num_tokens": 2571261996.0, + "step": 5027 + }, + { + "epoch": 1.3596538669551108, + "grad_norm": 3.078125, + "learning_rate": 0.01737704580997786, + "loss": 3.2416, + "mean_token_accuracy": 0.36410510540008545, + "num_tokens": 2571786065.0, + "step": 5028 + }, + { + "epoch": 1.3599242833964305, + "grad_norm": 3.296875, + "learning_rate": 0.01737593326266706, + "loss": 3.3347, + "mean_token_accuracy": 0.37199506163597107, + "num_tokens": 2572310328.0, + "step": 5029 + }, + { + "epoch": 1.36019469983775, + "grad_norm": 3.34375, + "learning_rate": 0.017374820519723878, + "loss": 3.3656, + "mean_token_accuracy": 0.35854867100715637, + "num_tokens": 2572834398.0, + "step": 5030 + }, + { + "epoch": 1.3604651162790697, + "grad_norm": 22.25, + "learning_rate": 0.01737370758118245, + "loss": 14.0495, + "mean_token_accuracy": 4.871015335083939e-06, + "num_tokens": 2573358645.0, + "step": 5031 + }, + { + "epoch": 1.3607355327203894, + "grad_norm": 5.28125, + "learning_rate": 0.017372594447076936, + "loss": 3.612, + "mean_token_accuracy": 0.355557382106781, + "num_tokens": 2573882725.0, + "step": 5032 + }, + { + "epoch": 1.361005949161709, + "grad_norm": 2.484375, + "learning_rate": 0.01737148111744148, + "loss": 3.2605, + "mean_token_accuracy": 0.3744898736476898, + "num_tokens": 2574406935.0, + "step": 5033 + }, + { + "epoch": 1.3612763656030287, + "grad_norm": 3.171875, + "learning_rate": 0.01737036759231025, + "loss": 3.4055, + "mean_token_accuracy": 0.35454338788986206, + "num_tokens": 2574890593.0, + "step": 5034 + }, + { + "epoch": 1.3615467820443483, + "grad_norm": 3.171875, + "learning_rate": 0.017369253871717408, + "loss": 3.2843, + "mean_token_accuracy": 0.4133300185203552, + "num_tokens": 2575414871.0, + "step": 5035 + }, + { + "epoch": 1.361817198485668, + "grad_norm": 5.3125, + "learning_rate": 0.017368139955697123, + "loss": 3.1162, + "mean_token_accuracy": 0.4114815592765808, + "num_tokens": 2575939106.0, + "step": 5036 + }, + { + "epoch": 1.3620876149269876, + "grad_norm": 2.625, + "learning_rate": 0.01736702584428358, + "loss": 3.3588, + "mean_token_accuracy": 0.38523367047309875, + "num_tokens": 2576463280.0, + "step": 5037 + }, + { + "epoch": 1.3623580313683072, + "grad_norm": 2.484375, + "learning_rate": 0.017365911537510954, + "loss": 3.1145, + "mean_token_accuracy": 0.3998510241508484, + "num_tokens": 2576912102.0, + "step": 5038 + }, + { + "epoch": 1.3626284478096269, + "grad_norm": 2.234375, + "learning_rate": 0.017364797035413448, + "loss": 3.4322, + "mean_token_accuracy": 0.3676799237728119, + "num_tokens": 2577436360.0, + "step": 5039 + }, + { + "epoch": 1.3628988642509465, + "grad_norm": 3.484375, + "learning_rate": 0.017363682338025245, + "loss": 3.5384, + "mean_token_accuracy": 0.3636931777000427, + "num_tokens": 2577948723.0, + "step": 5040 + }, + { + "epoch": 1.3631692806922662, + "grad_norm": 2.359375, + "learning_rate": 0.017362567445380558, + "loss": 3.1155, + "mean_token_accuracy": 0.38498350977897644, + "num_tokens": 2578472983.0, + "step": 5041 + }, + { + "epoch": 1.3634396971335856, + "grad_norm": 3.4375, + "learning_rate": 0.01736145235751359, + "loss": 3.4594, + "mean_token_accuracy": 0.3654102385044098, + "num_tokens": 2578997255.0, + "step": 5042 + }, + { + "epoch": 1.3637101135749052, + "grad_norm": 4.28125, + "learning_rate": 0.017360337074458554, + "loss": 3.2597, + "mean_token_accuracy": 0.39403632283210754, + "num_tokens": 2579521486.0, + "step": 5043 + }, + { + "epoch": 1.3639805300162249, + "grad_norm": 3.015625, + "learning_rate": 0.017359221596249676, + "loss": 3.4503, + "mean_token_accuracy": 0.3662375807762146, + "num_tokens": 2580044522.0, + "step": 5044 + }, + { + "epoch": 1.3642509464575445, + "grad_norm": 3.0625, + "learning_rate": 0.017358105922921176, + "loss": 3.2539, + "mean_token_accuracy": 0.3661637008190155, + "num_tokens": 2580546846.0, + "step": 5045 + }, + { + "epoch": 1.3645213628988642, + "grad_norm": 2.3125, + "learning_rate": 0.017356990054507287, + "loss": 3.2812, + "mean_token_accuracy": 0.3835458755493164, + "num_tokens": 2581071079.0, + "step": 5046 + }, + { + "epoch": 1.3647917793401838, + "grad_norm": 3.0625, + "learning_rate": 0.01735587399104225, + "loss": 3.253, + "mean_token_accuracy": 0.37956833839416504, + "num_tokens": 2581580288.0, + "step": 5047 + }, + { + "epoch": 1.3650621957815035, + "grad_norm": 3.3125, + "learning_rate": 0.017354757732560306, + "loss": 3.3328, + "mean_token_accuracy": 0.37653642892837524, + "num_tokens": 2582104478.0, + "step": 5048 + }, + { + "epoch": 1.365332612222823, + "grad_norm": 3.4375, + "learning_rate": 0.017353641279095708, + "loss": 3.4394, + "mean_token_accuracy": 0.37354806065559387, + "num_tokens": 2582594224.0, + "step": 5049 + }, + { + "epoch": 1.3656030286641427, + "grad_norm": 3.125, + "learning_rate": 0.017352524630682705, + "loss": 3.3986, + "mean_token_accuracy": 0.39513731002807617, + "num_tokens": 2583118373.0, + "step": 5050 + }, + { + "epoch": 1.3658734451054624, + "grad_norm": 24.5, + "learning_rate": 0.017351407787355576, + "loss": 12.9981, + "mean_token_accuracy": 0.00026169803459197283, + "num_tokens": 2583641395.0, + "step": 5051 + }, + { + "epoch": 1.366143861546782, + "grad_norm": 8.0625, + "learning_rate": 0.01735029074914857, + "loss": 4.1292, + "mean_token_accuracy": 0.2964528799057007, + "num_tokens": 2584096483.0, + "step": 5052 + }, + { + "epoch": 1.3664142779881017, + "grad_norm": 2.28125, + "learning_rate": 0.01734917351609597, + "loss": 3.3039, + "mean_token_accuracy": 0.35500267148017883, + "num_tokens": 2584620674.0, + "step": 5053 + }, + { + "epoch": 1.3666846944294213, + "grad_norm": 2.625, + "learning_rate": 0.017348056088232058, + "loss": 3.3477, + "mean_token_accuracy": 0.33655911684036255, + "num_tokens": 2585144902.0, + "step": 5054 + }, + { + "epoch": 1.366955110870741, + "grad_norm": 3.734375, + "learning_rate": 0.017346938465591114, + "loss": 3.193, + "mean_token_accuracy": 0.39407578110694885, + "num_tokens": 2585669122.0, + "step": 5055 + }, + { + "epoch": 1.3672255273120606, + "grad_norm": 2.734375, + "learning_rate": 0.01734582064820743, + "loss": 3.2174, + "mean_token_accuracy": 0.3786923289299011, + "num_tokens": 2586193309.0, + "step": 5056 + }, + { + "epoch": 1.3674959437533802, + "grad_norm": 2.984375, + "learning_rate": 0.017344702636115313, + "loss": 3.1542, + "mean_token_accuracy": 0.39198726415634155, + "num_tokens": 2586717519.0, + "step": 5057 + }, + { + "epoch": 1.3677663601946999, + "grad_norm": 2.40625, + "learning_rate": 0.017343584429349055, + "loss": 3.2943, + "mean_token_accuracy": 0.3751943111419678, + "num_tokens": 2587241799.0, + "step": 5058 + }, + { + "epoch": 1.3680367766360195, + "grad_norm": 2.671875, + "learning_rate": 0.017342466027942975, + "loss": 3.2795, + "mean_token_accuracy": 0.38471752405166626, + "num_tokens": 2587746059.0, + "step": 5059 + }, + { + "epoch": 1.3683071930773392, + "grad_norm": 3.046875, + "learning_rate": 0.017341347431931383, + "loss": 3.2077, + "mean_token_accuracy": 0.3666277229785919, + "num_tokens": 2588270202.0, + "step": 5060 + }, + { + "epoch": 1.3685776095186588, + "grad_norm": 2.9375, + "learning_rate": 0.017340228641348605, + "loss": 3.2682, + "mean_token_accuracy": 0.3853500783443451, + "num_tokens": 2588794377.0, + "step": 5061 + }, + { + "epoch": 1.3688480259599785, + "grad_norm": 2.703125, + "learning_rate": 0.017339109656228965, + "loss": 3.4543, + "mean_token_accuracy": 0.35280144214630127, + "num_tokens": 2589318651.0, + "step": 5062 + }, + { + "epoch": 1.369118442401298, + "grad_norm": 2.578125, + "learning_rate": 0.017337990476606804, + "loss": 3.2031, + "mean_token_accuracy": 0.3945724368095398, + "num_tokens": 2589842925.0, + "step": 5063 + }, + { + "epoch": 1.3693888588426177, + "grad_norm": 2.890625, + "learning_rate": 0.01733687110251645, + "loss": 3.3311, + "mean_token_accuracy": 0.3659919798374176, + "num_tokens": 2590367165.0, + "step": 5064 + }, + { + "epoch": 1.3696592752839374, + "grad_norm": 3.046875, + "learning_rate": 0.017335751533992253, + "loss": 3.3911, + "mean_token_accuracy": 0.38124698400497437, + "num_tokens": 2590835205.0, + "step": 5065 + }, + { + "epoch": 1.369929691725257, + "grad_norm": 2.734375, + "learning_rate": 0.017334631771068573, + "loss": 3.1778, + "mean_token_accuracy": 0.3974156677722931, + "num_tokens": 2591359464.0, + "step": 5066 + }, + { + "epoch": 1.3702001081665764, + "grad_norm": 2.859375, + "learning_rate": 0.01733351181377976, + "loss": 3.4711, + "mean_token_accuracy": 0.3635847866535187, + "num_tokens": 2591836289.0, + "step": 5067 + }, + { + "epoch": 1.370470524607896, + "grad_norm": 3.09375, + "learning_rate": 0.01733239166216018, + "loss": 3.5505, + "mean_token_accuracy": 0.3730887174606323, + "num_tokens": 2592338365.0, + "step": 5068 + }, + { + "epoch": 1.3707409410492157, + "grad_norm": 3.109375, + "learning_rate": 0.017331271316244196, + "loss": 3.1443, + "mean_token_accuracy": 0.4110774099826813, + "num_tokens": 2592855109.0, + "step": 5069 + }, + { + "epoch": 1.3710113574905354, + "grad_norm": 3.234375, + "learning_rate": 0.017330150776066193, + "loss": 3.2211, + "mean_token_accuracy": 0.3941485285758972, + "num_tokens": 2593350930.0, + "step": 5070 + }, + { + "epoch": 1.371281773931855, + "grad_norm": 29.0, + "learning_rate": 0.01732903004166055, + "loss": 11.2101, + "mean_token_accuracy": 7.23896300769411e-06, + "num_tokens": 2593875142.0, + "step": 5071 + }, + { + "epoch": 1.3715521903731747, + "grad_norm": 8.625, + "learning_rate": 0.01732790911306165, + "loss": 3.8052, + "mean_token_accuracy": 0.3900800049304962, + "num_tokens": 2594304180.0, + "step": 5072 + }, + { + "epoch": 1.3718226068144943, + "grad_norm": 2.59375, + "learning_rate": 0.017326787990303887, + "loss": 3.3181, + "mean_token_accuracy": 0.3869708776473999, + "num_tokens": 2594781260.0, + "step": 5073 + }, + { + "epoch": 1.372093023255814, + "grad_norm": 3.265625, + "learning_rate": 0.017325666673421666, + "loss": 3.3153, + "mean_token_accuracy": 0.37962955236434937, + "num_tokens": 2595305529.0, + "step": 5074 + }, + { + "epoch": 1.3723634396971336, + "grad_norm": 3.078125, + "learning_rate": 0.01732454516244939, + "loss": 3.2993, + "mean_token_accuracy": 0.3761027753353119, + "num_tokens": 2595829739.0, + "step": 5075 + }, + { + "epoch": 1.3726338561384532, + "grad_norm": 3.828125, + "learning_rate": 0.017323423457421466, + "loss": 3.0561, + "mean_token_accuracy": 0.42699068784713745, + "num_tokens": 2596353966.0, + "step": 5076 + }, + { + "epoch": 1.3729042725797729, + "grad_norm": 2.375, + "learning_rate": 0.01732230155837232, + "loss": 3.2492, + "mean_token_accuracy": 0.3830505609512329, + "num_tokens": 2596851658.0, + "step": 5077 + }, + { + "epoch": 1.3731746890210925, + "grad_norm": 2.65625, + "learning_rate": 0.017321179465336362, + "loss": 3.2797, + "mean_token_accuracy": 0.36644643545150757, + "num_tokens": 2597375899.0, + "step": 5078 + }, + { + "epoch": 1.3734451054624122, + "grad_norm": 2.765625, + "learning_rate": 0.017320057178348034, + "loss": 3.3066, + "mean_token_accuracy": 0.37587684392929077, + "num_tokens": 2597899981.0, + "step": 5079 + }, + { + "epoch": 1.3737155219037318, + "grad_norm": 3.375, + "learning_rate": 0.01731893469744176, + "loss": 2.7793, + "mean_token_accuracy": 0.45119258761405945, + "num_tokens": 2598424180.0, + "step": 5080 + }, + { + "epoch": 1.3739859383450514, + "grad_norm": 2.453125, + "learning_rate": 0.017317812022651993, + "loss": 3.3748, + "mean_token_accuracy": 0.3752497434616089, + "num_tokens": 2598948395.0, + "step": 5081 + }, + { + "epoch": 1.374256354786371, + "grad_norm": 4.0625, + "learning_rate": 0.01731668915401317, + "loss": 3.3309, + "mean_token_accuracy": 0.36734962463378906, + "num_tokens": 2599467328.0, + "step": 5082 + }, + { + "epoch": 1.3745267712276905, + "grad_norm": 3.1875, + "learning_rate": 0.01731556609155975, + "loss": 3.4223, + "mean_token_accuracy": 0.3794385492801666, + "num_tokens": 2599991596.0, + "step": 5083 + }, + { + "epoch": 1.3747971876690102, + "grad_norm": 3.59375, + "learning_rate": 0.017314442835326186, + "loss": 3.5549, + "mean_token_accuracy": 0.36024612188339233, + "num_tokens": 2600515877.0, + "step": 5084 + }, + { + "epoch": 1.3750676041103298, + "grad_norm": 3.796875, + "learning_rate": 0.01731331938534695, + "loss": 3.1134, + "mean_token_accuracy": 0.40675830841064453, + "num_tokens": 2600984983.0, + "step": 5085 + }, + { + "epoch": 1.3753380205516494, + "grad_norm": 2.40625, + "learning_rate": 0.01731219574165651, + "loss": 3.0497, + "mean_token_accuracy": 0.41002923250198364, + "num_tokens": 2601509107.0, + "step": 5086 + }, + { + "epoch": 1.375608436992969, + "grad_norm": 3.015625, + "learning_rate": 0.01731107190428934, + "loss": 3.4155, + "mean_token_accuracy": 0.3642873764038086, + "num_tokens": 2602033333.0, + "step": 5087 + }, + { + "epoch": 1.3758788534342887, + "grad_norm": 2.78125, + "learning_rate": 0.01730994787327992, + "loss": 3.442, + "mean_token_accuracy": 0.37593019008636475, + "num_tokens": 2602557563.0, + "step": 5088 + }, + { + "epoch": 1.3761492698756084, + "grad_norm": 3.796875, + "learning_rate": 0.017308823648662746, + "loss": 3.4162, + "mean_token_accuracy": 0.3619083762168884, + "num_tokens": 2603070893.0, + "step": 5089 + }, + { + "epoch": 1.376419686316928, + "grad_norm": 3.0625, + "learning_rate": 0.01730769923047231, + "loss": 3.4297, + "mean_token_accuracy": 0.36320391297340393, + "num_tokens": 2603595116.0, + "step": 5090 + }, + { + "epoch": 1.3766901027582477, + "grad_norm": 72.5, + "learning_rate": 0.017306574618743114, + "loss": 11.8387, + "mean_token_accuracy": 0.009816288948059082, + "num_tokens": 2604119322.0, + "step": 5091 + }, + { + "epoch": 1.3769605191995673, + "grad_norm": 6.65625, + "learning_rate": 0.017305449813509656, + "loss": 4.0261, + "mean_token_accuracy": 0.30534684658050537, + "num_tokens": 2604643499.0, + "step": 5092 + }, + { + "epoch": 1.377230935640887, + "grad_norm": 2.140625, + "learning_rate": 0.01730432481480646, + "loss": 3.5852, + "mean_token_accuracy": 0.36751967668533325, + "num_tokens": 2605131425.0, + "step": 5093 + }, + { + "epoch": 1.3775013520822066, + "grad_norm": 3.140625, + "learning_rate": 0.017303199622668037, + "loss": 3.2281, + "mean_token_accuracy": 0.3829883337020874, + "num_tokens": 2605655507.0, + "step": 5094 + }, + { + "epoch": 1.3777717685235262, + "grad_norm": 3.203125, + "learning_rate": 0.017302074237128914, + "loss": 3.4358, + "mean_token_accuracy": 0.3605267405509949, + "num_tokens": 2606179729.0, + "step": 5095 + }, + { + "epoch": 1.3780421849648459, + "grad_norm": 3.15625, + "learning_rate": 0.017300948658223618, + "loss": 3.4941, + "mean_token_accuracy": 0.3662525713443756, + "num_tokens": 2606668637.0, + "step": 5096 + }, + { + "epoch": 1.3783126014061655, + "grad_norm": 3.09375, + "learning_rate": 0.01729982288598669, + "loss": 3.4265, + "mean_token_accuracy": 0.3909699320793152, + "num_tokens": 2607140337.0, + "step": 5097 + }, + { + "epoch": 1.3785830178474852, + "grad_norm": 3.265625, + "learning_rate": 0.017298696920452668, + "loss": 3.35, + "mean_token_accuracy": 0.38177967071533203, + "num_tokens": 2607628879.0, + "step": 5098 + }, + { + "epoch": 1.3788534342888048, + "grad_norm": 2.78125, + "learning_rate": 0.0172975707616561, + "loss": 3.3782, + "mean_token_accuracy": 0.38357651233673096, + "num_tokens": 2608153049.0, + "step": 5099 + }, + { + "epoch": 1.3791238507301244, + "grad_norm": 3.15625, + "learning_rate": 0.017296444409631545, + "loss": 3.2768, + "mean_token_accuracy": 0.3849463164806366, + "num_tokens": 2608677186.0, + "step": 5100 + }, + { + "epoch": 1.379394267171444, + "grad_norm": 3.0625, + "learning_rate": 0.017295317864413556, + "loss": 3.3716, + "mean_token_accuracy": 0.3771378695964813, + "num_tokens": 2609181729.0, + "step": 5101 + }, + { + "epoch": 1.3796646836127637, + "grad_norm": 3.03125, + "learning_rate": 0.0172941911260367, + "loss": 3.378, + "mean_token_accuracy": 0.40889161825180054, + "num_tokens": 2609643390.0, + "step": 5102 + }, + { + "epoch": 1.3799351000540834, + "grad_norm": 3.640625, + "learning_rate": 0.017293064194535555, + "loss": 3.2237, + "mean_token_accuracy": 0.3984842002391815, + "num_tokens": 2610167551.0, + "step": 5103 + }, + { + "epoch": 1.380205516495403, + "grad_norm": 3.484375, + "learning_rate": 0.01729193706994469, + "loss": 3.4165, + "mean_token_accuracy": 0.375771164894104, + "num_tokens": 2610647850.0, + "step": 5104 + }, + { + "epoch": 1.3804759329367227, + "grad_norm": 2.96875, + "learning_rate": 0.017290809752298693, + "loss": 3.3358, + "mean_token_accuracy": 0.3780476450920105, + "num_tokens": 2611172102.0, + "step": 5105 + }, + { + "epoch": 1.3807463493780423, + "grad_norm": 3.75, + "learning_rate": 0.017289682241632152, + "loss": 3.4087, + "mean_token_accuracy": 0.335917592048645, + "num_tokens": 2611696380.0, + "step": 5106 + }, + { + "epoch": 1.381016765819362, + "grad_norm": 2.734375, + "learning_rate": 0.017288554537979665, + "loss": 3.058, + "mean_token_accuracy": 0.39426225423812866, + "num_tokens": 2612167095.0, + "step": 5107 + }, + { + "epoch": 1.3812871822606814, + "grad_norm": 3.0, + "learning_rate": 0.01728742664137583, + "loss": 3.3002, + "mean_token_accuracy": 0.3871069550514221, + "num_tokens": 2612691363.0, + "step": 5108 + }, + { + "epoch": 1.381557598702001, + "grad_norm": 2.890625, + "learning_rate": 0.017286298551855254, + "loss": 3.265, + "mean_token_accuracy": 0.3970935344696045, + "num_tokens": 2613215471.0, + "step": 5109 + }, + { + "epoch": 1.3818280151433207, + "grad_norm": 3.140625, + "learning_rate": 0.017285170269452556, + "loss": 3.2215, + "mean_token_accuracy": 0.39720091223716736, + "num_tokens": 2613710285.0, + "step": 5110 + }, + { + "epoch": 1.3820984315846403, + "grad_norm": 26.0, + "learning_rate": 0.017284041794202346, + "loss": 11.6943, + "mean_token_accuracy": 0.0, + "num_tokens": 2614229707.0, + "step": 5111 + }, + { + "epoch": 1.38236884802596, + "grad_norm": 8.5625, + "learning_rate": 0.017282913126139256, + "loss": 3.8871, + "mean_token_accuracy": 0.31221532821655273, + "num_tokens": 2614753965.0, + "step": 5112 + }, + { + "epoch": 1.3826392644672796, + "grad_norm": 2.21875, + "learning_rate": 0.017281784265297917, + "loss": 3.5299, + "mean_token_accuracy": 0.3666805028915405, + "num_tokens": 2615266825.0, + "step": 5113 + }, + { + "epoch": 1.3829096809085992, + "grad_norm": 2.578125, + "learning_rate": 0.017280655211712962, + "loss": 3.2475, + "mean_token_accuracy": 0.3904290497303009, + "num_tokens": 2615767818.0, + "step": 5114 + }, + { + "epoch": 1.3831800973499189, + "grad_norm": 2.90625, + "learning_rate": 0.01727952596541903, + "loss": 3.3371, + "mean_token_accuracy": 0.3818165063858032, + "num_tokens": 2616256918.0, + "step": 5115 + }, + { + "epoch": 1.3834505137912385, + "grad_norm": 2.6875, + "learning_rate": 0.01727839652645078, + "loss": 3.3098, + "mean_token_accuracy": 0.3865682780742645, + "num_tokens": 2616781107.0, + "step": 5116 + }, + { + "epoch": 1.3837209302325582, + "grad_norm": 2.921875, + "learning_rate": 0.01727726689484286, + "loss": 3.3861, + "mean_token_accuracy": 0.3716709017753601, + "num_tokens": 2617305263.0, + "step": 5117 + }, + { + "epoch": 1.3839913466738778, + "grad_norm": 2.671875, + "learning_rate": 0.017276137070629926, + "loss": 3.3442, + "mean_token_accuracy": 0.3777770698070526, + "num_tokens": 2617829365.0, + "step": 5118 + }, + { + "epoch": 1.3842617631151974, + "grad_norm": 3.171875, + "learning_rate": 0.017275007053846655, + "loss": 3.2824, + "mean_token_accuracy": 0.38882702589035034, + "num_tokens": 2618301965.0, + "step": 5119 + }, + { + "epoch": 1.384532179556517, + "grad_norm": 2.140625, + "learning_rate": 0.017273876844527716, + "loss": 3.3893, + "mean_token_accuracy": 0.3520149290561676, + "num_tokens": 2618791566.0, + "step": 5120 + }, + { + "epoch": 1.3848025959978367, + "grad_norm": 2.53125, + "learning_rate": 0.017272746442707782, + "loss": 3.3578, + "mean_token_accuracy": 0.3518136739730835, + "num_tokens": 2619315834.0, + "step": 5121 + }, + { + "epoch": 1.3850730124391564, + "grad_norm": 3.5625, + "learning_rate": 0.01727161584842154, + "loss": 3.4375, + "mean_token_accuracy": 0.3829137086868286, + "num_tokens": 2619839986.0, + "step": 5122 + }, + { + "epoch": 1.385343428880476, + "grad_norm": 2.6875, + "learning_rate": 0.01727048506170368, + "loss": 3.3394, + "mean_token_accuracy": 0.374165415763855, + "num_tokens": 2620364194.0, + "step": 5123 + }, + { + "epoch": 1.3856138453217954, + "grad_norm": 2.796875, + "learning_rate": 0.017269354082588898, + "loss": 3.3621, + "mean_token_accuracy": 0.3941704034805298, + "num_tokens": 2620888481.0, + "step": 5124 + }, + { + "epoch": 1.385884261763115, + "grad_norm": 3.828125, + "learning_rate": 0.017268222911111897, + "loss": 3.1833, + "mean_token_accuracy": 0.3842300772666931, + "num_tokens": 2621412758.0, + "step": 5125 + }, + { + "epoch": 1.3861546782044347, + "grad_norm": 2.1875, + "learning_rate": 0.017267091547307385, + "loss": 3.3118, + "mean_token_accuracy": 0.38123422861099243, + "num_tokens": 2621937022.0, + "step": 5126 + }, + { + "epoch": 1.3864250946457544, + "grad_norm": 2.796875, + "learning_rate": 0.01726595999121007, + "loss": 3.3902, + "mean_token_accuracy": 0.3968704044818878, + "num_tokens": 2622461299.0, + "step": 5127 + }, + { + "epoch": 1.386695511087074, + "grad_norm": 3.203125, + "learning_rate": 0.017264828242854677, + "loss": 3.399, + "mean_token_accuracy": 0.37247300148010254, + "num_tokens": 2622907692.0, + "step": 5128 + }, + { + "epoch": 1.3869659275283936, + "grad_norm": 2.859375, + "learning_rate": 0.01726369630227593, + "loss": 3.4123, + "mean_token_accuracy": 0.3878272771835327, + "num_tokens": 2623374135.0, + "step": 5129 + }, + { + "epoch": 1.3872363439697133, + "grad_norm": 2.71875, + "learning_rate": 0.01726256416950856, + "loss": 3.3022, + "mean_token_accuracy": 0.35188913345336914, + "num_tokens": 2623898378.0, + "step": 5130 + }, + { + "epoch": 1.387506760411033, + "grad_norm": 93.0, + "learning_rate": 0.017261431844587304, + "loss": 11.1445, + "mean_token_accuracy": 0.008075917139649391, + "num_tokens": 2624422594.0, + "step": 5131 + }, + { + "epoch": 1.3877771768523526, + "grad_norm": 6.25, + "learning_rate": 0.017260299327546906, + "loss": 3.6362, + "mean_token_accuracy": 0.3647286295890808, + "num_tokens": 2624946784.0, + "step": 5132 + }, + { + "epoch": 1.3880475932936722, + "grad_norm": 3.015625, + "learning_rate": 0.01725916661842212, + "loss": 3.2981, + "mean_token_accuracy": 0.38237887620925903, + "num_tokens": 2625470992.0, + "step": 5133 + }, + { + "epoch": 1.3883180097349919, + "grad_norm": 4.21875, + "learning_rate": 0.017258033717247684, + "loss": 3.4501, + "mean_token_accuracy": 0.34441232681274414, + "num_tokens": 2625991487.0, + "step": 5134 + }, + { + "epoch": 1.3885884261763115, + "grad_norm": 3.109375, + "learning_rate": 0.017256900624058377, + "loss": 3.434, + "mean_token_accuracy": 0.36094436049461365, + "num_tokens": 2626513344.0, + "step": 5135 + }, + { + "epoch": 1.3888588426176312, + "grad_norm": 3.765625, + "learning_rate": 0.017255767338888957, + "loss": 3.2763, + "mean_token_accuracy": 0.3724386990070343, + "num_tokens": 2627037593.0, + "step": 5136 + }, + { + "epoch": 1.3891292590589508, + "grad_norm": 3.4375, + "learning_rate": 0.017254633861774196, + "loss": 3.3387, + "mean_token_accuracy": 0.396337628364563, + "num_tokens": 2627516260.0, + "step": 5137 + }, + { + "epoch": 1.3893996755002704, + "grad_norm": 3.453125, + "learning_rate": 0.017253500192748877, + "loss": 3.4631, + "mean_token_accuracy": 0.3690786361694336, + "num_tokens": 2628026389.0, + "step": 5138 + }, + { + "epoch": 1.38967009194159, + "grad_norm": 2.140625, + "learning_rate": 0.01725236633184778, + "loss": 2.9769, + "mean_token_accuracy": 0.4139534533023834, + "num_tokens": 2628476271.0, + "step": 5139 + }, + { + "epoch": 1.3899405083829097, + "grad_norm": 3.890625, + "learning_rate": 0.0172512322791057, + "loss": 2.8196, + "mean_token_accuracy": 0.407215416431427, + "num_tokens": 2628987828.0, + "step": 5140 + }, + { + "epoch": 1.3902109248242294, + "grad_norm": 2.4375, + "learning_rate": 0.01725009803455743, + "loss": 3.1977, + "mean_token_accuracy": 0.38619208335876465, + "num_tokens": 2629512002.0, + "step": 5141 + }, + { + "epoch": 1.390481341265549, + "grad_norm": 3.5, + "learning_rate": 0.017248963598237767, + "loss": 3.5259, + "mean_token_accuracy": 0.33119508624076843, + "num_tokens": 2630036184.0, + "step": 5142 + }, + { + "epoch": 1.3907517577068687, + "grad_norm": 3.96875, + "learning_rate": 0.017247828970181525, + "loss": 3.0254, + "mean_token_accuracy": 0.40841299295425415, + "num_tokens": 2630560399.0, + "step": 5143 + }, + { + "epoch": 1.3910221741481883, + "grad_norm": 2.1875, + "learning_rate": 0.017246694150423517, + "loss": 3.1438, + "mean_token_accuracy": 0.3881418704986572, + "num_tokens": 2631084650.0, + "step": 5144 + }, + { + "epoch": 1.391292590589508, + "grad_norm": 2.640625, + "learning_rate": 0.017245559138998563, + "loss": 3.2659, + "mean_token_accuracy": 0.3769344687461853, + "num_tokens": 2631608920.0, + "step": 5145 + }, + { + "epoch": 1.3915630070308276, + "grad_norm": 25.25, + "learning_rate": 0.017244423935941493, + "loss": 3.3167, + "mean_token_accuracy": 0.3759099841117859, + "num_tokens": 2632133142.0, + "step": 5146 + }, + { + "epoch": 1.3918334234721472, + "grad_norm": 4.03125, + "learning_rate": 0.017243288541287122, + "loss": 3.3875, + "mean_token_accuracy": 0.38135749101638794, + "num_tokens": 2632609676.0, + "step": 5147 + }, + { + "epoch": 1.3921038399134669, + "grad_norm": 2.90625, + "learning_rate": 0.017242152955070308, + "loss": 3.4621, + "mean_token_accuracy": 0.354303777217865, + "num_tokens": 2633133908.0, + "step": 5148 + }, + { + "epoch": 1.3923742563547863, + "grad_norm": 2.984375, + "learning_rate": 0.01724101717732588, + "loss": 3.3356, + "mean_token_accuracy": 0.3669448494911194, + "num_tokens": 2633658136.0, + "step": 5149 + }, + { + "epoch": 1.392644672796106, + "grad_norm": 2.84375, + "learning_rate": 0.017239881208088694, + "loss": 3.3561, + "mean_token_accuracy": 0.38112449645996094, + "num_tokens": 2634182417.0, + "step": 5150 + }, + { + "epoch": 1.3929150892374256, + "grad_norm": 4.25, + "learning_rate": 0.0172387450473936, + "loss": 9.9802, + "mean_token_accuracy": 0.008499990217387676, + "num_tokens": 2634706582.0, + "step": 5151 + }, + { + "epoch": 1.3931855056787452, + "grad_norm": 7.71875, + "learning_rate": 0.017237608695275457, + "loss": 3.8882, + "mean_token_accuracy": 0.2839100658893585, + "num_tokens": 2635230839.0, + "step": 5152 + }, + { + "epoch": 1.3934559221200649, + "grad_norm": 2.5, + "learning_rate": 0.01723647215176914, + "loss": 3.6769, + "mean_token_accuracy": 0.3454994559288025, + "num_tokens": 2635755091.0, + "step": 5153 + }, + { + "epoch": 1.3937263385613845, + "grad_norm": 3.328125, + "learning_rate": 0.017235335416909518, + "loss": 3.3941, + "mean_token_accuracy": 0.3555675148963928, + "num_tokens": 2636279308.0, + "step": 5154 + }, + { + "epoch": 1.3939967550027041, + "grad_norm": 3.4375, + "learning_rate": 0.01723419849073147, + "loss": 3.4282, + "mean_token_accuracy": 0.36644691228866577, + "num_tokens": 2636803574.0, + "step": 5155 + }, + { + "epoch": 1.3942671714440238, + "grad_norm": 3.015625, + "learning_rate": 0.017233061373269874, + "loss": 3.3006, + "mean_token_accuracy": 0.38321930170059204, + "num_tokens": 2637327742.0, + "step": 5156 + }, + { + "epoch": 1.3945375878853434, + "grad_norm": 3.078125, + "learning_rate": 0.017231924064559624, + "loss": 3.178, + "mean_token_accuracy": 0.3712103068828583, + "num_tokens": 2637827843.0, + "step": 5157 + }, + { + "epoch": 1.394808004326663, + "grad_norm": 2.75, + "learning_rate": 0.01723078656463562, + "loss": 3.4171, + "mean_token_accuracy": 0.4079386591911316, + "num_tokens": 2638289030.0, + "step": 5158 + }, + { + "epoch": 1.3950784207679827, + "grad_norm": 2.890625, + "learning_rate": 0.01722964887353276, + "loss": 3.4757, + "mean_token_accuracy": 0.3686130940914154, + "num_tokens": 2638813259.0, + "step": 5159 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 3.875, + "learning_rate": 0.017228510991285955, + "loss": 3.3971, + "mean_token_accuracy": 0.3772162199020386, + "num_tokens": 2639293852.0, + "step": 5160 + }, + { + "epoch": 1.395619253650622, + "grad_norm": 3.671875, + "learning_rate": 0.01722737291793011, + "loss": 3.2946, + "mean_token_accuracy": 0.36780956387519836, + "num_tokens": 2639817933.0, + "step": 5161 + }, + { + "epoch": 1.3958896700919416, + "grad_norm": 3.0, + "learning_rate": 0.017226234653500153, + "loss": 3.5527, + "mean_token_accuracy": 0.3395659625530243, + "num_tokens": 2640342085.0, + "step": 5162 + }, + { + "epoch": 1.3961600865332613, + "grad_norm": 3.265625, + "learning_rate": 0.01722509619803101, + "loss": 3.6291, + "mean_token_accuracy": 0.3812710642814636, + "num_tokens": 2640866352.0, + "step": 5163 + }, + { + "epoch": 1.396430502974581, + "grad_norm": 2.5625, + "learning_rate": 0.017223957551557607, + "loss": 3.6131, + "mean_token_accuracy": 0.328951358795166, + "num_tokens": 2641341243.0, + "step": 5164 + }, + { + "epoch": 1.3967009194159004, + "grad_norm": 3.234375, + "learning_rate": 0.017222818714114883, + "loss": 3.4188, + "mean_token_accuracy": 0.39807426929473877, + "num_tokens": 2641801240.0, + "step": 5165 + }, + { + "epoch": 1.39697133585722, + "grad_norm": 2.40625, + "learning_rate": 0.017221679685737783, + "loss": 3.3917, + "mean_token_accuracy": 0.3712563216686249, + "num_tokens": 2642325463.0, + "step": 5166 + }, + { + "epoch": 1.3972417522985396, + "grad_norm": 3.046875, + "learning_rate": 0.017220540466461248, + "loss": 3.4079, + "mean_token_accuracy": 0.3750994801521301, + "num_tokens": 2642849727.0, + "step": 5167 + }, + { + "epoch": 1.3975121687398593, + "grad_norm": 3.359375, + "learning_rate": 0.017219401056320242, + "loss": 3.2326, + "mean_token_accuracy": 0.3944454789161682, + "num_tokens": 2643373996.0, + "step": 5168 + }, + { + "epoch": 1.397782585181179, + "grad_norm": 4.53125, + "learning_rate": 0.01721826145534972, + "loss": 3.2722, + "mean_token_accuracy": 0.3682766556739807, + "num_tokens": 2643811179.0, + "step": 5169 + }, + { + "epoch": 1.3980530016224986, + "grad_norm": 2.390625, + "learning_rate": 0.01721712166358465, + "loss": 3.4286, + "mean_token_accuracy": 0.3856047987937927, + "num_tokens": 2644335440.0, + "step": 5170 + }, + { + "epoch": 1.3983234180638182, + "grad_norm": 1.3125, + "learning_rate": 0.017215981681060002, + "loss": 11.1773, + "mean_token_accuracy": 0.0, + "num_tokens": 2644859567.0, + "step": 5171 + }, + { + "epoch": 1.3985938345051379, + "grad_norm": 8.3125, + "learning_rate": 0.01721484150781076, + "loss": 4.2329, + "mean_token_accuracy": 0.27893590927124023, + "num_tokens": 2645383612.0, + "step": 5172 + }, + { + "epoch": 1.3988642509464575, + "grad_norm": 3.453125, + "learning_rate": 0.017213701143871902, + "loss": 3.7081, + "mean_token_accuracy": 0.33372431993484497, + "num_tokens": 2645870391.0, + "step": 5173 + }, + { + "epoch": 1.3991346673877771, + "grad_norm": 2.609375, + "learning_rate": 0.01721256058927842, + "loss": 3.4854, + "mean_token_accuracy": 0.3579821288585663, + "num_tokens": 2646394489.0, + "step": 5174 + }, + { + "epoch": 1.3994050838290968, + "grad_norm": 3.9375, + "learning_rate": 0.017211419844065306, + "loss": 3.1411, + "mean_token_accuracy": 0.4090102016925812, + "num_tokens": 2646918550.0, + "step": 5175 + }, + { + "epoch": 1.3996755002704164, + "grad_norm": 2.609375, + "learning_rate": 0.017210278908267568, + "loss": 3.4204, + "mean_token_accuracy": 0.3757810592651367, + "num_tokens": 2647438651.0, + "step": 5176 + }, + { + "epoch": 1.399945916711736, + "grad_norm": 3.546875, + "learning_rate": 0.01720913778192021, + "loss": 3.5649, + "mean_token_accuracy": 0.3550932705402374, + "num_tokens": 2647903721.0, + "step": 5177 + }, + { + "epoch": 1.4002163331530557, + "grad_norm": 2.375, + "learning_rate": 0.017207996465058245, + "loss": 3.4935, + "mean_token_accuracy": 0.36930200457572937, + "num_tokens": 2648395258.0, + "step": 5178 + }, + { + "epoch": 1.4004867495943754, + "grad_norm": 3.421875, + "learning_rate": 0.017206854957716688, + "loss": 3.4971, + "mean_token_accuracy": 0.3636336624622345, + "num_tokens": 2648917188.0, + "step": 5179 + }, + { + "epoch": 1.400757166035695, + "grad_norm": 2.28125, + "learning_rate": 0.01720571325993057, + "loss": 3.186, + "mean_token_accuracy": 0.38569772243499756, + "num_tokens": 2649441343.0, + "step": 5180 + }, + { + "epoch": 1.4010275824770146, + "grad_norm": 3.3125, + "learning_rate": 0.017204571371734922, + "loss": 3.4676, + "mean_token_accuracy": 0.358327180147171, + "num_tokens": 2649965581.0, + "step": 5181 + }, + { + "epoch": 1.4012979989183343, + "grad_norm": 2.96875, + "learning_rate": 0.017203429293164776, + "loss": 3.2262, + "mean_token_accuracy": 0.39250674843788147, + "num_tokens": 2650489837.0, + "step": 5182 + }, + { + "epoch": 1.401568415359654, + "grad_norm": 3.078125, + "learning_rate": 0.017202287024255174, + "loss": 3.3281, + "mean_token_accuracy": 0.3687656819820404, + "num_tokens": 2651014022.0, + "step": 5183 + }, + { + "epoch": 1.4018388318009736, + "grad_norm": 2.734375, + "learning_rate": 0.01720114456504117, + "loss": 3.4679, + "mean_token_accuracy": 0.3787553608417511, + "num_tokens": 2651501565.0, + "step": 5184 + }, + { + "epoch": 1.4021092482422932, + "grad_norm": 3.125, + "learning_rate": 0.01720000191555781, + "loss": 3.5361, + "mean_token_accuracy": 0.362164169549942, + "num_tokens": 2652025820.0, + "step": 5185 + }, + { + "epoch": 1.4023796646836129, + "grad_norm": 2.5, + "learning_rate": 0.017198859075840164, + "loss": 3.3267, + "mean_token_accuracy": 0.38270702958106995, + "num_tokens": 2652549976.0, + "step": 5186 + }, + { + "epoch": 1.4026500811249325, + "grad_norm": 3.359375, + "learning_rate": 0.017197716045923288, + "loss": 3.2675, + "mean_token_accuracy": 0.3900737166404724, + "num_tokens": 2653005716.0, + "step": 5187 + }, + { + "epoch": 1.4029204975662521, + "grad_norm": 2.359375, + "learning_rate": 0.01719657282584226, + "loss": 3.3451, + "mean_token_accuracy": 0.3687029480934143, + "num_tokens": 2653529980.0, + "step": 5188 + }, + { + "epoch": 1.4031909140075718, + "grad_norm": 2.71875, + "learning_rate": 0.017195429415632152, + "loss": 3.1588, + "mean_token_accuracy": 0.3894006311893463, + "num_tokens": 2654054174.0, + "step": 5189 + }, + { + "epoch": 1.4034613304488914, + "grad_norm": 2.4375, + "learning_rate": 0.01719428581532805, + "loss": 3.248, + "mean_token_accuracy": 0.3837641775608063, + "num_tokens": 2654578375.0, + "step": 5190 + }, + { + "epoch": 1.4037317468902109, + "grad_norm": 3.578125, + "learning_rate": 0.017193142024965045, + "loss": 10.1161, + "mean_token_accuracy": 6.882339948788285e-05, + "num_tokens": 2655072456.0, + "step": 5191 + }, + { + "epoch": 1.4040021633315305, + "grad_norm": 6.75, + "learning_rate": 0.01719199804457823, + "loss": 3.6309, + "mean_token_accuracy": 0.34936249256134033, + "num_tokens": 2655555198.0, + "step": 5192 + }, + { + "epoch": 1.4042725797728501, + "grad_norm": 3.09375, + "learning_rate": 0.017190853874202706, + "loss": 3.601, + "mean_token_accuracy": 0.36115533113479614, + "num_tokens": 2656079403.0, + "step": 5193 + }, + { + "epoch": 1.4045429962141698, + "grad_norm": 3.78125, + "learning_rate": 0.017189709513873577, + "loss": 3.2961, + "mean_token_accuracy": 0.3733070492744446, + "num_tokens": 2656603663.0, + "step": 5194 + }, + { + "epoch": 1.4048134126554894, + "grad_norm": 2.375, + "learning_rate": 0.017188564963625957, + "loss": 3.5155, + "mean_token_accuracy": 0.36747124791145325, + "num_tokens": 2657127800.0, + "step": 5195 + }, + { + "epoch": 1.405083829096809, + "grad_norm": 3.421875, + "learning_rate": 0.017187420223494964, + "loss": 3.5723, + "mean_token_accuracy": 0.39516493678092957, + "num_tokens": 2657528701.0, + "step": 5196 + }, + { + "epoch": 1.4053542455381287, + "grad_norm": 3.3125, + "learning_rate": 0.017186275293515723, + "loss": 3.2389, + "mean_token_accuracy": 0.3938477635383606, + "num_tokens": 2658052837.0, + "step": 5197 + }, + { + "epoch": 1.4056246619794484, + "grad_norm": 3.046875, + "learning_rate": 0.017185130173723365, + "loss": 3.3716, + "mean_token_accuracy": 0.4202273488044739, + "num_tokens": 2658422953.0, + "step": 5198 + }, + { + "epoch": 1.405895078420768, + "grad_norm": 2.734375, + "learning_rate": 0.017183984864153023, + "loss": 3.4236, + "mean_token_accuracy": 0.3992367684841156, + "num_tokens": 2658905203.0, + "step": 5199 + }, + { + "epoch": 1.4061654948620876, + "grad_norm": 3.84375, + "learning_rate": 0.01718283936483984, + "loss": 3.2051, + "mean_token_accuracy": 0.3380342125892639, + "num_tokens": 2659429463.0, + "step": 5200 + }, + { + "epoch": 1.4064359113034073, + "grad_norm": 2.953125, + "learning_rate": 0.017181693675818965, + "loss": 3.2217, + "mean_token_accuracy": 0.3942938446998596, + "num_tokens": 2659953645.0, + "step": 5201 + }, + { + "epoch": 1.406706327744727, + "grad_norm": 3.21875, + "learning_rate": 0.017180547797125544, + "loss": 3.5079, + "mean_token_accuracy": 0.36316680908203125, + "num_tokens": 2660477923.0, + "step": 5202 + }, + { + "epoch": 1.4069767441860466, + "grad_norm": 3.40625, + "learning_rate": 0.017179401728794744, + "loss": 3.331, + "mean_token_accuracy": 0.40695297718048096, + "num_tokens": 2660989263.0, + "step": 5203 + }, + { + "epoch": 1.4072471606273662, + "grad_norm": 3.34375, + "learning_rate": 0.017178255470861725, + "loss": 3.3229, + "mean_token_accuracy": 0.3650476038455963, + "num_tokens": 2661513531.0, + "step": 5204 + }, + { + "epoch": 1.4075175770686859, + "grad_norm": 2.65625, + "learning_rate": 0.01717710902336166, + "loss": 3.1578, + "mean_token_accuracy": 0.40208321809768677, + "num_tokens": 2662037585.0, + "step": 5205 + }, + { + "epoch": 1.4077879935100053, + "grad_norm": 3.609375, + "learning_rate": 0.01717596238632972, + "loss": 3.232, + "mean_token_accuracy": 0.370048850774765, + "num_tokens": 2662561868.0, + "step": 5206 + }, + { + "epoch": 1.408058409951325, + "grad_norm": 2.9375, + "learning_rate": 0.017174815559801097, + "loss": 3.3619, + "mean_token_accuracy": 0.3916485905647278, + "num_tokens": 2663026024.0, + "step": 5207 + }, + { + "epoch": 1.4083288263926446, + "grad_norm": 3.328125, + "learning_rate": 0.01717366854381097, + "loss": 3.3202, + "mean_token_accuracy": 0.38026270270347595, + "num_tokens": 2663550202.0, + "step": 5208 + }, + { + "epoch": 1.4085992428339642, + "grad_norm": 2.796875, + "learning_rate": 0.01717252133839454, + "loss": 3.0861, + "mean_token_accuracy": 0.38952022790908813, + "num_tokens": 2664040215.0, + "step": 5209 + }, + { + "epoch": 1.4088696592752838, + "grad_norm": 2.625, + "learning_rate": 0.017171373943587004, + "loss": 2.9817, + "mean_token_accuracy": 0.402139812707901, + "num_tokens": 2664564294.0, + "step": 5210 + }, + { + "epoch": 1.4091400757166035, + "grad_norm": 45.25, + "learning_rate": 0.017170226359423564, + "loss": 12.0565, + "mean_token_accuracy": 0.03471074253320694, + "num_tokens": 2665088516.0, + "step": 5211 + }, + { + "epoch": 1.4094104921579231, + "grad_norm": 6.4375, + "learning_rate": 0.017169078585939437, + "loss": 3.8186, + "mean_token_accuracy": 0.33161744475364685, + "num_tokens": 2665612738.0, + "step": 5212 + }, + { + "epoch": 1.4096809085992428, + "grad_norm": 2.09375, + "learning_rate": 0.017167930623169833, + "loss": 3.4451, + "mean_token_accuracy": 0.3503192365169525, + "num_tokens": 2666136867.0, + "step": 5213 + }, + { + "epoch": 1.4099513250405624, + "grad_norm": 2.96875, + "learning_rate": 0.017166782471149982, + "loss": 3.5376, + "mean_token_accuracy": 0.36438632011413574, + "num_tokens": 2666661045.0, + "step": 5214 + }, + { + "epoch": 1.410221741481882, + "grad_norm": 3.0625, + "learning_rate": 0.017165634129915106, + "loss": 3.4883, + "mean_token_accuracy": 0.34299805760383606, + "num_tokens": 2667185224.0, + "step": 5215 + }, + { + "epoch": 1.4104921579232017, + "grad_norm": 3.640625, + "learning_rate": 0.017164485599500446, + "loss": 3.3678, + "mean_token_accuracy": 0.37446510791778564, + "num_tokens": 2667693262.0, + "step": 5216 + }, + { + "epoch": 1.4107625743645213, + "grad_norm": 2.515625, + "learning_rate": 0.017163336879941232, + "loss": 3.3358, + "mean_token_accuracy": 0.3807569146156311, + "num_tokens": 2668165250.0, + "step": 5217 + }, + { + "epoch": 1.411032990805841, + "grad_norm": 3.8125, + "learning_rate": 0.017162187971272728, + "loss": 3.4479, + "mean_token_accuracy": 0.34997567534446716, + "num_tokens": 2668683837.0, + "step": 5218 + }, + { + "epoch": 1.4113034072471606, + "grad_norm": 3.234375, + "learning_rate": 0.017161038873530172, + "loss": 3.3824, + "mean_token_accuracy": 0.3625965714454651, + "num_tokens": 2669208089.0, + "step": 5219 + }, + { + "epoch": 1.4115738236884803, + "grad_norm": 4.15625, + "learning_rate": 0.017159889586748824, + "loss": 3.3513, + "mean_token_accuracy": 0.37695807218551636, + "num_tokens": 2669732365.0, + "step": 5220 + }, + { + "epoch": 1.4118442401298, + "grad_norm": 3.859375, + "learning_rate": 0.017158740110963945, + "loss": 3.3343, + "mean_token_accuracy": 0.3558692932128906, + "num_tokens": 2670256617.0, + "step": 5221 + }, + { + "epoch": 1.4121146565711196, + "grad_norm": 3.65625, + "learning_rate": 0.01715759044621081, + "loss": 3.466, + "mean_token_accuracy": 0.37270355224609375, + "num_tokens": 2670780791.0, + "step": 5222 + }, + { + "epoch": 1.4123850730124392, + "grad_norm": 2.8125, + "learning_rate": 0.01715644059252469, + "loss": 3.2076, + "mean_token_accuracy": 0.37519872188568115, + "num_tokens": 2671305043.0, + "step": 5223 + }, + { + "epoch": 1.4126554894537589, + "grad_norm": 2.84375, + "learning_rate": 0.01715529054994087, + "loss": 3.4961, + "mean_token_accuracy": 0.3643055558204651, + "num_tokens": 2671829272.0, + "step": 5224 + }, + { + "epoch": 1.4129259058950785, + "grad_norm": 3.265625, + "learning_rate": 0.017154140318494635, + "loss": 3.3485, + "mean_token_accuracy": 0.40064337849617004, + "num_tokens": 2672289327.0, + "step": 5225 + }, + { + "epoch": 1.4131963223363981, + "grad_norm": 2.15625, + "learning_rate": 0.01715298989822128, + "loss": 3.1049, + "mean_token_accuracy": 0.416240394115448, + "num_tokens": 2672813358.0, + "step": 5226 + }, + { + "epoch": 1.4134667387777178, + "grad_norm": 2.25, + "learning_rate": 0.017151839289156093, + "loss": 3.157, + "mean_token_accuracy": 0.40620216727256775, + "num_tokens": 2673247192.0, + "step": 5227 + }, + { + "epoch": 1.4137371552190374, + "grad_norm": 3.9375, + "learning_rate": 0.01715068849133439, + "loss": 3.421, + "mean_token_accuracy": 0.38307392597198486, + "num_tokens": 2673771207.0, + "step": 5228 + }, + { + "epoch": 1.414007571660357, + "grad_norm": 3.796875, + "learning_rate": 0.017149537504791475, + "loss": 3.3673, + "mean_token_accuracy": 0.3571215271949768, + "num_tokens": 2674295440.0, + "step": 5229 + }, + { + "epoch": 1.4142779881016767, + "grad_norm": 2.234375, + "learning_rate": 0.017148386329562663, + "loss": 3.2353, + "mean_token_accuracy": 0.39385145902633667, + "num_tokens": 2674819590.0, + "step": 5230 + }, + { + "epoch": 1.4145484045429964, + "grad_norm": 9.3125, + "learning_rate": 0.017147234965683277, + "loss": 17.1741, + "mean_token_accuracy": 0.0, + "num_tokens": 2675343867.0, + "step": 5231 + }, + { + "epoch": 1.4148188209843158, + "grad_norm": 6.8125, + "learning_rate": 0.01714608341318865, + "loss": 3.8561, + "mean_token_accuracy": 0.30547070503234863, + "num_tokens": 2675868065.0, + "step": 5232 + }, + { + "epoch": 1.4150892374256354, + "grad_norm": 2.6875, + "learning_rate": 0.017144931672114103, + "loss": 3.3967, + "mean_token_accuracy": 0.35451430082321167, + "num_tokens": 2676392318.0, + "step": 5233 + }, + { + "epoch": 1.415359653866955, + "grad_norm": 2.578125, + "learning_rate": 0.017143779742494984, + "loss": 3.3658, + "mean_token_accuracy": 0.36547526717185974, + "num_tokens": 2676916500.0, + "step": 5234 + }, + { + "epoch": 1.4156300703082747, + "grad_norm": 2.875, + "learning_rate": 0.017142627624366635, + "loss": 3.2698, + "mean_token_accuracy": 0.3856319189071655, + "num_tokens": 2677440676.0, + "step": 5235 + }, + { + "epoch": 1.4159004867495943, + "grad_norm": 3.15625, + "learning_rate": 0.01714147531776441, + "loss": 3.2177, + "mean_token_accuracy": 0.377209335565567, + "num_tokens": 2677961231.0, + "step": 5236 + }, + { + "epoch": 1.416170903190914, + "grad_norm": 2.8125, + "learning_rate": 0.017140322822723653, + "loss": 3.4751, + "mean_token_accuracy": 0.3667910099029541, + "num_tokens": 2678431250.0, + "step": 5237 + }, + { + "epoch": 1.4164413196322336, + "grad_norm": 3.421875, + "learning_rate": 0.01713917013927974, + "loss": 3.3228, + "mean_token_accuracy": 0.37139666080474854, + "num_tokens": 2678955446.0, + "step": 5238 + }, + { + "epoch": 1.4167117360735533, + "grad_norm": 3.71875, + "learning_rate": 0.017138017267468032, + "loss": 3.3025, + "mean_token_accuracy": 0.36486053466796875, + "num_tokens": 2679479592.0, + "step": 5239 + }, + { + "epoch": 1.416982152514873, + "grad_norm": 2.453125, + "learning_rate": 0.017136864207323903, + "loss": 3.3464, + "mean_token_accuracy": 0.36601942777633667, + "num_tokens": 2680003816.0, + "step": 5240 + }, + { + "epoch": 1.4172525689561926, + "grad_norm": 3.0, + "learning_rate": 0.017135710958882732, + "loss": 3.4187, + "mean_token_accuracy": 0.3875672221183777, + "num_tokens": 2680528007.0, + "step": 5241 + }, + { + "epoch": 1.4175229853975122, + "grad_norm": 2.390625, + "learning_rate": 0.0171345575221799, + "loss": 3.3536, + "mean_token_accuracy": 0.38070201873779297, + "num_tokens": 2681052266.0, + "step": 5242 + }, + { + "epoch": 1.4177934018388318, + "grad_norm": 3.171875, + "learning_rate": 0.017133403897250814, + "loss": 3.2048, + "mean_token_accuracy": 0.37103626132011414, + "num_tokens": 2681576394.0, + "step": 5243 + }, + { + "epoch": 1.4180638182801515, + "grad_norm": 3.09375, + "learning_rate": 0.017132250084130853, + "loss": 3.2085, + "mean_token_accuracy": 0.3658243715763092, + "num_tokens": 2682100581.0, + "step": 5244 + }, + { + "epoch": 1.4183342347214711, + "grad_norm": 3.03125, + "learning_rate": 0.017131096082855426, + "loss": 3.3127, + "mean_token_accuracy": 0.37997108697891235, + "num_tokens": 2682624856.0, + "step": 5245 + }, + { + "epoch": 1.4186046511627908, + "grad_norm": 3.203125, + "learning_rate": 0.01712994189345994, + "loss": 3.5031, + "mean_token_accuracy": 0.36035844683647156, + "num_tokens": 2683149042.0, + "step": 5246 + }, + { + "epoch": 1.4188750676041102, + "grad_norm": 2.015625, + "learning_rate": 0.01712878751597981, + "loss": 3.1262, + "mean_token_accuracy": 0.39532196521759033, + "num_tokens": 2683673187.0, + "step": 5247 + }, + { + "epoch": 1.4191454840454298, + "grad_norm": 3.1875, + "learning_rate": 0.017127632950450455, + "loss": 3.1741, + "mean_token_accuracy": 0.3740760385990143, + "num_tokens": 2684197414.0, + "step": 5248 + }, + { + "epoch": 1.4194159004867495, + "grad_norm": 3.296875, + "learning_rate": 0.0171264781969073, + "loss": 3.0219, + "mean_token_accuracy": 0.4052460193634033, + "num_tokens": 2684721657.0, + "step": 5249 + }, + { + "epoch": 1.4196863169280691, + "grad_norm": 2.578125, + "learning_rate": 0.01712532325538578, + "loss": 3.3637, + "mean_token_accuracy": 0.36921149492263794, + "num_tokens": 2685245940.0, + "step": 5250 + }, + { + "epoch": 1.4199567333693888, + "grad_norm": 3.203125, + "learning_rate": 0.017124168125921328, + "loss": 10.2464, + "mean_token_accuracy": 0.00634643342345953, + "num_tokens": 2685770179.0, + "step": 5251 + }, + { + "epoch": 1.4202271498107084, + "grad_norm": 8.1875, + "learning_rate": 0.01712301280854939, + "loss": 4.1608, + "mean_token_accuracy": 0.32481253147125244, + "num_tokens": 2686294441.0, + "step": 5252 + }, + { + "epoch": 1.420497566252028, + "grad_norm": 2.71875, + "learning_rate": 0.01712185730330541, + "loss": 3.208, + "mean_token_accuracy": 0.3757789433002472, + "num_tokens": 2686818614.0, + "step": 5253 + }, + { + "epoch": 1.4207679826933477, + "grad_norm": 2.8125, + "learning_rate": 0.017120701610224845, + "loss": 3.6431, + "mean_token_accuracy": 0.3809727728366852, + "num_tokens": 2687269912.0, + "step": 5254 + }, + { + "epoch": 1.4210383991346673, + "grad_norm": 3.0, + "learning_rate": 0.017119545729343155, + "loss": 3.3608, + "mean_token_accuracy": 0.3660966753959656, + "num_tokens": 2687794089.0, + "step": 5255 + }, + { + "epoch": 1.421308815575987, + "grad_norm": 3.59375, + "learning_rate": 0.017118389660695805, + "loss": 3.4848, + "mean_token_accuracy": 0.37630897760391235, + "num_tokens": 2688318308.0, + "step": 5256 + }, + { + "epoch": 1.4215792320173066, + "grad_norm": 3.5625, + "learning_rate": 0.017117233404318265, + "loss": 3.383, + "mean_token_accuracy": 0.3606851398944855, + "num_tokens": 2688837940.0, + "step": 5257 + }, + { + "epoch": 1.4218496484586263, + "grad_norm": 2.984375, + "learning_rate": 0.01711607696024602, + "loss": 3.2763, + "mean_token_accuracy": 0.39130738377571106, + "num_tokens": 2689362052.0, + "step": 5258 + }, + { + "epoch": 1.422120064899946, + "grad_norm": 22.875, + "learning_rate": 0.017114920328514544, + "loss": 3.2232, + "mean_token_accuracy": 0.3709898293018341, + "num_tokens": 2689886042.0, + "step": 5259 + }, + { + "epoch": 1.4223904813412656, + "grad_norm": 3.625, + "learning_rate": 0.017113763509159333, + "loss": 3.4025, + "mean_token_accuracy": 0.36932599544525146, + "num_tokens": 2690410318.0, + "step": 5260 + }, + { + "epoch": 1.4226608977825852, + "grad_norm": 2.203125, + "learning_rate": 0.017112606502215875, + "loss": 3.3006, + "mean_token_accuracy": 0.3784582018852234, + "num_tokens": 2690934476.0, + "step": 5261 + }, + { + "epoch": 1.4229313142239048, + "grad_norm": 3.703125, + "learning_rate": 0.017111449307719673, + "loss": 3.5569, + "mean_token_accuracy": 0.3646165728569031, + "num_tokens": 2691458643.0, + "step": 5262 + }, + { + "epoch": 1.4232017306652245, + "grad_norm": 2.5, + "learning_rate": 0.017110291925706237, + "loss": 3.2523, + "mean_token_accuracy": 0.37895143032073975, + "num_tokens": 2691982810.0, + "step": 5263 + }, + { + "epoch": 1.4234721471065441, + "grad_norm": 2.828125, + "learning_rate": 0.017109134356211073, + "loss": 3.4792, + "mean_token_accuracy": 0.36351561546325684, + "num_tokens": 2692507079.0, + "step": 5264 + }, + { + "epoch": 1.4237425635478638, + "grad_norm": 2.75, + "learning_rate": 0.0171079765992697, + "loss": 3.2761, + "mean_token_accuracy": 0.38100847601890564, + "num_tokens": 2692992926.0, + "step": 5265 + }, + { + "epoch": 1.4240129799891834, + "grad_norm": 3.0625, + "learning_rate": 0.017106818654917644, + "loss": 3.0974, + "mean_token_accuracy": 0.40672600269317627, + "num_tokens": 2693485314.0, + "step": 5266 + }, + { + "epoch": 1.424283396430503, + "grad_norm": 3.15625, + "learning_rate": 0.01710566052319043, + "loss": 3.4251, + "mean_token_accuracy": 0.3775511384010315, + "num_tokens": 2694009420.0, + "step": 5267 + }, + { + "epoch": 1.4245538128718227, + "grad_norm": 3.015625, + "learning_rate": 0.0171045022041236, + "loss": 3.3862, + "mean_token_accuracy": 0.3941294550895691, + "num_tokens": 2694523592.0, + "step": 5268 + }, + { + "epoch": 1.4248242293131423, + "grad_norm": 3.421875, + "learning_rate": 0.01710334369775269, + "loss": 3.4629, + "mean_token_accuracy": 0.38925620913505554, + "num_tokens": 2695019924.0, + "step": 5269 + }, + { + "epoch": 1.425094645754462, + "grad_norm": 3.296875, + "learning_rate": 0.017102185004113243, + "loss": 3.2434, + "mean_token_accuracy": 0.3648010492324829, + "num_tokens": 2695544099.0, + "step": 5270 + }, + { + "epoch": 1.4253650621957816, + "grad_norm": 123.0, + "learning_rate": 0.01710102612324082, + "loss": 16.7295, + "mean_token_accuracy": 0.025491271167993546, + "num_tokens": 2696068378.0, + "step": 5271 + }, + { + "epoch": 1.4256354786371013, + "grad_norm": 6.15625, + "learning_rate": 0.01709986705517097, + "loss": 3.8784, + "mean_token_accuracy": 0.3406057357788086, + "num_tokens": 2696592646.0, + "step": 5272 + }, + { + "epoch": 1.4259058950784207, + "grad_norm": 2.15625, + "learning_rate": 0.01709870779993926, + "loss": 3.3041, + "mean_token_accuracy": 0.37114161252975464, + "num_tokens": 2697116930.0, + "step": 5273 + }, + { + "epoch": 1.4261763115197403, + "grad_norm": 2.453125, + "learning_rate": 0.017097548357581257, + "loss": 3.3914, + "mean_token_accuracy": 0.37596315145492554, + "num_tokens": 2697641118.0, + "step": 5274 + }, + { + "epoch": 1.42644672796106, + "grad_norm": 3.390625, + "learning_rate": 0.017096388728132544, + "loss": 3.4617, + "mean_token_accuracy": 0.3637620508670807, + "num_tokens": 2698165317.0, + "step": 5275 + }, + { + "epoch": 1.4267171444023796, + "grad_norm": 2.546875, + "learning_rate": 0.017095228911628697, + "loss": 3.2866, + "mean_token_accuracy": 0.3769903779029846, + "num_tokens": 2698669117.0, + "step": 5276 + }, + { + "epoch": 1.4269875608436993, + "grad_norm": 3.25, + "learning_rate": 0.0170940689081053, + "loss": 3.3343, + "mean_token_accuracy": 0.34973564743995667, + "num_tokens": 2699193386.0, + "step": 5277 + }, + { + "epoch": 1.427257977285019, + "grad_norm": 2.265625, + "learning_rate": 0.017092908717597943, + "loss": 3.2357, + "mean_token_accuracy": 0.38729867339134216, + "num_tokens": 2699717606.0, + "step": 5278 + }, + { + "epoch": 1.4275283937263386, + "grad_norm": 3.015625, + "learning_rate": 0.017091748340142234, + "loss": 3.5044, + "mean_token_accuracy": 0.3604733347892761, + "num_tokens": 2700241777.0, + "step": 5279 + }, + { + "epoch": 1.4277988101676582, + "grad_norm": 2.21875, + "learning_rate": 0.017090587775773766, + "loss": 3.1359, + "mean_token_accuracy": 0.40520939230918884, + "num_tokens": 2700734574.0, + "step": 5280 + }, + { + "epoch": 1.4280692266089778, + "grad_norm": 2.625, + "learning_rate": 0.01708942702452816, + "loss": 3.3361, + "mean_token_accuracy": 0.39042216539382935, + "num_tokens": 2701210302.0, + "step": 5281 + }, + { + "epoch": 1.4283396430502975, + "grad_norm": 3.1875, + "learning_rate": 0.01708826608644102, + "loss": 3.3266, + "mean_token_accuracy": 0.37017327547073364, + "num_tokens": 2701734567.0, + "step": 5282 + }, + { + "epoch": 1.4286100594916171, + "grad_norm": 2.78125, + "learning_rate": 0.017087104961547975, + "loss": 3.3325, + "mean_token_accuracy": 0.3809773921966553, + "num_tokens": 2702258735.0, + "step": 5283 + }, + { + "epoch": 1.4288804759329368, + "grad_norm": 2.84375, + "learning_rate": 0.01708594364988465, + "loss": 2.9934, + "mean_token_accuracy": 0.3865198493003845, + "num_tokens": 2702783000.0, + "step": 5284 + }, + { + "epoch": 1.4291508923742564, + "grad_norm": 3.46875, + "learning_rate": 0.017084782151486674, + "loss": 3.3342, + "mean_token_accuracy": 0.3726383447647095, + "num_tokens": 2703307098.0, + "step": 5285 + }, + { + "epoch": 1.429421308815576, + "grad_norm": 4.3125, + "learning_rate": 0.017083620466389688, + "loss": 3.398, + "mean_token_accuracy": 0.3706698417663574, + "num_tokens": 2703776976.0, + "step": 5286 + }, + { + "epoch": 1.4296917252568957, + "grad_norm": 3.0, + "learning_rate": 0.017082458594629337, + "loss": 3.3137, + "mean_token_accuracy": 0.38598892092704773, + "num_tokens": 2704301002.0, + "step": 5287 + }, + { + "epoch": 1.4299621416982151, + "grad_norm": 2.6875, + "learning_rate": 0.017081296536241266, + "loss": 3.1143, + "mean_token_accuracy": 0.4105222821235657, + "num_tokens": 2704825247.0, + "step": 5288 + }, + { + "epoch": 1.4302325581395348, + "grad_norm": 2.765625, + "learning_rate": 0.017080134291261134, + "loss": 3.2948, + "mean_token_accuracy": 0.3998820185661316, + "num_tokens": 2705250466.0, + "step": 5289 + }, + { + "epoch": 1.4305029745808544, + "grad_norm": 2.734375, + "learning_rate": 0.0170789718597246, + "loss": 3.427, + "mean_token_accuracy": 0.3762056529521942, + "num_tokens": 2705750151.0, + "step": 5290 + }, + { + "epoch": 1.430773391022174, + "grad_norm": 57.25, + "learning_rate": 0.01707780924166734, + "loss": 23.8926, + "mean_token_accuracy": 0.0, + "num_tokens": 2706274337.0, + "step": 5291 + }, + { + "epoch": 1.4310438074634937, + "grad_norm": 11.4375, + "learning_rate": 0.017076646437125007, + "loss": 4.2519, + "mean_token_accuracy": 0.28574907779693604, + "num_tokens": 2706743987.0, + "step": 5292 + }, + { + "epoch": 1.4313142239048133, + "grad_norm": 3.359375, + "learning_rate": 0.017075483446133296, + "loss": 3.5707, + "mean_token_accuracy": 0.3372634947299957, + "num_tokens": 2707268268.0, + "step": 5293 + }, + { + "epoch": 1.431584640346133, + "grad_norm": 2.734375, + "learning_rate": 0.01707432026872789, + "loss": 3.466, + "mean_token_accuracy": 0.37090012431144714, + "num_tokens": 2707792446.0, + "step": 5294 + }, + { + "epoch": 1.4318550567874526, + "grad_norm": 3.5, + "learning_rate": 0.017073156904944473, + "loss": 3.4654, + "mean_token_accuracy": 0.37667590379714966, + "num_tokens": 2708294947.0, + "step": 5295 + }, + { + "epoch": 1.4321254732287723, + "grad_norm": 2.390625, + "learning_rate": 0.017071993354818737, + "loss": 3.3444, + "mean_token_accuracy": 0.3861374258995056, + "num_tokens": 2708805453.0, + "step": 5296 + }, + { + "epoch": 1.432395889670092, + "grad_norm": 2.59375, + "learning_rate": 0.017070829618386393, + "loss": 3.2446, + "mean_token_accuracy": 0.3830929398536682, + "num_tokens": 2709329549.0, + "step": 5297 + }, + { + "epoch": 1.4326663061114115, + "grad_norm": 3.34375, + "learning_rate": 0.017069665695683144, + "loss": 3.3371, + "mean_token_accuracy": 0.38192063570022583, + "num_tokens": 2709853826.0, + "step": 5298 + }, + { + "epoch": 1.4329367225527312, + "grad_norm": 3.734375, + "learning_rate": 0.0170685015867447, + "loss": 3.5462, + "mean_token_accuracy": 0.3615882396697998, + "num_tokens": 2710368081.0, + "step": 5299 + }, + { + "epoch": 1.4332071389940508, + "grad_norm": 2.65625, + "learning_rate": 0.01706733729160678, + "loss": 3.1708, + "mean_token_accuracy": 0.39619016647338867, + "num_tokens": 2710888693.0, + "step": 5300 + }, + { + "epoch": 1.4334775554353705, + "grad_norm": 3.4375, + "learning_rate": 0.01706617281030511, + "loss": 3.286, + "mean_token_accuracy": 0.372413694858551, + "num_tokens": 2711387838.0, + "step": 5301 + }, + { + "epoch": 1.4337479718766901, + "grad_norm": 3.1875, + "learning_rate": 0.01706500814287542, + "loss": 3.2891, + "mean_token_accuracy": 0.35325080156326294, + "num_tokens": 2711912074.0, + "step": 5302 + }, + { + "epoch": 1.4340183883180098, + "grad_norm": 3.046875, + "learning_rate": 0.017063843289353443, + "loss": 3.4013, + "mean_token_accuracy": 0.35999470949172974, + "num_tokens": 2712396750.0, + "step": 5303 + }, + { + "epoch": 1.4342888047593294, + "grad_norm": 2.8125, + "learning_rate": 0.01706267824977492, + "loss": 3.2449, + "mean_token_accuracy": 0.3768479824066162, + "num_tokens": 2712920958.0, + "step": 5304 + }, + { + "epoch": 1.434559221200649, + "grad_norm": 3.640625, + "learning_rate": 0.017061513024175603, + "loss": 2.8288, + "mean_token_accuracy": 0.46006184816360474, + "num_tokens": 2713445220.0, + "step": 5305 + }, + { + "epoch": 1.4348296376419687, + "grad_norm": 3.0625, + "learning_rate": 0.017060347612591235, + "loss": 3.3712, + "mean_token_accuracy": 0.3890972137451172, + "num_tokens": 2713884126.0, + "step": 5306 + }, + { + "epoch": 1.4351000540832883, + "grad_norm": 4.03125, + "learning_rate": 0.017059182015057583, + "loss": 3.4484, + "mean_token_accuracy": 0.38225042819976807, + "num_tokens": 2714408306.0, + "step": 5307 + }, + { + "epoch": 1.435370470524608, + "grad_norm": 2.765625, + "learning_rate": 0.01705801623161041, + "loss": 3.4742, + "mean_token_accuracy": 0.36144810914993286, + "num_tokens": 2714932569.0, + "step": 5308 + }, + { + "epoch": 1.4356408869659276, + "grad_norm": 2.796875, + "learning_rate": 0.01705685026228548, + "loss": 3.2299, + "mean_token_accuracy": 0.3960469365119934, + "num_tokens": 2715456774.0, + "step": 5309 + }, + { + "epoch": 1.4359113034072473, + "grad_norm": 2.75, + "learning_rate": 0.017055684107118576, + "loss": 3.2975, + "mean_token_accuracy": 0.3666743338108063, + "num_tokens": 2715981048.0, + "step": 5310 + }, + { + "epoch": 1.436181719848567, + "grad_norm": 65.5, + "learning_rate": 0.01705451776614547, + "loss": 19.3597, + "mean_token_accuracy": 0.02395864576101303, + "num_tokens": 2716505111.0, + "step": 5311 + }, + { + "epoch": 1.4364521362898865, + "grad_norm": 7.90625, + "learning_rate": 0.017053351239401954, + "loss": 3.2847, + "mean_token_accuracy": 0.387938916683197, + "num_tokens": 2717029232.0, + "step": 5312 + }, + { + "epoch": 1.4367225527312062, + "grad_norm": 2.75, + "learning_rate": 0.017052184526923825, + "loss": 3.2217, + "mean_token_accuracy": 0.40160229802131653, + "num_tokens": 2717552344.0, + "step": 5313 + }, + { + "epoch": 1.4369929691725256, + "grad_norm": 2.5, + "learning_rate": 0.017051017628746876, + "loss": 3.1859, + "mean_token_accuracy": 0.3954694867134094, + "num_tokens": 2718076562.0, + "step": 5314 + }, + { + "epoch": 1.4372633856138453, + "grad_norm": 3.9375, + "learning_rate": 0.017049850544906906, + "loss": 3.3847, + "mean_token_accuracy": 0.3332756757736206, + "num_tokens": 2718589224.0, + "step": 5315 + }, + { + "epoch": 1.437533802055165, + "grad_norm": 2.453125, + "learning_rate": 0.017048683275439732, + "loss": 3.4555, + "mean_token_accuracy": 0.37088340520858765, + "num_tokens": 2719113398.0, + "step": 5316 + }, + { + "epoch": 1.4378042184964845, + "grad_norm": 2.828125, + "learning_rate": 0.01704751582038117, + "loss": 3.387, + "mean_token_accuracy": 0.35790348052978516, + "num_tokens": 2719637507.0, + "step": 5317 + }, + { + "epoch": 1.4380746349378042, + "grad_norm": 2.390625, + "learning_rate": 0.017046348179767034, + "loss": 3.0589, + "mean_token_accuracy": 0.40694382786750793, + "num_tokens": 2720094076.0, + "step": 5318 + }, + { + "epoch": 1.4383450513791238, + "grad_norm": 3.1875, + "learning_rate": 0.017045180353633158, + "loss": 3.4744, + "mean_token_accuracy": 0.37631702423095703, + "num_tokens": 2720594043.0, + "step": 5319 + }, + { + "epoch": 1.4386154678204435, + "grad_norm": 4.0, + "learning_rate": 0.017044012342015368, + "loss": 3.1891, + "mean_token_accuracy": 0.38545137643814087, + "num_tokens": 2721060442.0, + "step": 5320 + }, + { + "epoch": 1.4388858842617631, + "grad_norm": 3.84375, + "learning_rate": 0.01704284414494951, + "loss": 3.341, + "mean_token_accuracy": 0.3737589120864868, + "num_tokens": 2721569174.0, + "step": 5321 + }, + { + "epoch": 1.4391563007030828, + "grad_norm": 3.40625, + "learning_rate": 0.01704167576247142, + "loss": 3.3679, + "mean_token_accuracy": 0.3619574308395386, + "num_tokens": 2722093419.0, + "step": 5322 + }, + { + "epoch": 1.4394267171444024, + "grad_norm": 2.734375, + "learning_rate": 0.017040507194616947, + "loss": 3.4993, + "mean_token_accuracy": 0.3608814775943756, + "num_tokens": 2722617639.0, + "step": 5323 + }, + { + "epoch": 1.439697133585722, + "grad_norm": 2.859375, + "learning_rate": 0.017039338441421956, + "loss": 3.4596, + "mean_token_accuracy": 0.3729664087295532, + "num_tokens": 2723141912.0, + "step": 5324 + }, + { + "epoch": 1.4399675500270417, + "grad_norm": 2.609375, + "learning_rate": 0.017038169502922293, + "loss": 3.2694, + "mean_token_accuracy": 0.36477935314178467, + "num_tokens": 2723666089.0, + "step": 5325 + }, + { + "epoch": 1.4402379664683613, + "grad_norm": 2.59375, + "learning_rate": 0.017037000379153837, + "loss": 3.1375, + "mean_token_accuracy": 0.39981013536453247, + "num_tokens": 2724156575.0, + "step": 5326 + }, + { + "epoch": 1.440508382909681, + "grad_norm": 2.65625, + "learning_rate": 0.017035831070152458, + "loss": 3.4132, + "mean_token_accuracy": 0.371218740940094, + "num_tokens": 2724664428.0, + "step": 5327 + }, + { + "epoch": 1.4407787993510006, + "grad_norm": 2.75, + "learning_rate": 0.017034661575954024, + "loss": 3.2907, + "mean_token_accuracy": 0.39800071716308594, + "num_tokens": 2725138009.0, + "step": 5328 + }, + { + "epoch": 1.44104921579232, + "grad_norm": 3.140625, + "learning_rate": 0.017033491896594432, + "loss": 3.3642, + "mean_token_accuracy": 0.36912381649017334, + "num_tokens": 2725662225.0, + "step": 5329 + }, + { + "epoch": 1.4413196322336397, + "grad_norm": 2.671875, + "learning_rate": 0.017032322032109568, + "loss": 2.8576, + "mean_token_accuracy": 0.4231243133544922, + "num_tokens": 2726141895.0, + "step": 5330 + }, + { + "epoch": 1.4415900486749593, + "grad_norm": 67.0, + "learning_rate": 0.017031151982535322, + "loss": 15.8531, + "mean_token_accuracy": 0.0, + "num_tokens": 2726666024.0, + "step": 5331 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 9.1875, + "learning_rate": 0.01702998174790759, + "loss": 4.1807, + "mean_token_accuracy": 0.32381772994995117, + "num_tokens": 2727164394.0, + "step": 5332 + }, + { + "epoch": 1.4421308815575986, + "grad_norm": 3.453125, + "learning_rate": 0.017028811328262294, + "loss": 3.4206, + "mean_token_accuracy": 0.35930758714675903, + "num_tokens": 2727688592.0, + "step": 5333 + }, + { + "epoch": 1.4424012979989183, + "grad_norm": 2.78125, + "learning_rate": 0.017027640723635332, + "loss": 3.4018, + "mean_token_accuracy": 0.34481674432754517, + "num_tokens": 2728212843.0, + "step": 5334 + }, + { + "epoch": 1.442671714440238, + "grad_norm": 2.890625, + "learning_rate": 0.017026469934062625, + "loss": 3.6668, + "mean_token_accuracy": 0.3802223801612854, + "num_tokens": 2728674590.0, + "step": 5335 + }, + { + "epoch": 1.4429421308815575, + "grad_norm": 2.640625, + "learning_rate": 0.0170252989595801, + "loss": 3.2322, + "mean_token_accuracy": 0.3965889811515808, + "num_tokens": 2729163330.0, + "step": 5336 + }, + { + "epoch": 1.4432125473228772, + "grad_norm": 3.15625, + "learning_rate": 0.017024127800223682, + "loss": 3.3825, + "mean_token_accuracy": 0.3626823425292969, + "num_tokens": 2729687488.0, + "step": 5337 + }, + { + "epoch": 1.4434829637641968, + "grad_norm": 2.84375, + "learning_rate": 0.017022956456029314, + "loss": 3.3957, + "mean_token_accuracy": 0.36150163412094116, + "num_tokens": 2730211621.0, + "step": 5338 + }, + { + "epoch": 1.4437533802055165, + "grad_norm": 2.828125, + "learning_rate": 0.017021784927032924, + "loss": 3.206, + "mean_token_accuracy": 0.39973998069763184, + "num_tokens": 2730735748.0, + "step": 5339 + }, + { + "epoch": 1.444023796646836, + "grad_norm": 2.421875, + "learning_rate": 0.017020613213270464, + "loss": 3.1868, + "mean_token_accuracy": 0.39072680473327637, + "num_tokens": 2731215210.0, + "step": 5340 + }, + { + "epoch": 1.4442942130881558, + "grad_norm": 3.046875, + "learning_rate": 0.017019441314777888, + "loss": 3.2385, + "mean_token_accuracy": 0.3839207887649536, + "num_tokens": 2731739407.0, + "step": 5341 + }, + { + "epoch": 1.4445646295294754, + "grad_norm": 7.4375, + "learning_rate": 0.01701826923159114, + "loss": 3.6927, + "mean_token_accuracy": 0.3610589802265167, + "num_tokens": 2732204581.0, + "step": 5342 + }, + { + "epoch": 1.444835045970795, + "grad_norm": 2.21875, + "learning_rate": 0.017017096963746207, + "loss": 3.257, + "mean_token_accuracy": 0.391676664352417, + "num_tokens": 2732728795.0, + "step": 5343 + }, + { + "epoch": 1.4451054624121147, + "grad_norm": 3.171875, + "learning_rate": 0.017015924511279038, + "loss": 3.3416, + "mean_token_accuracy": 0.40091463923454285, + "num_tokens": 2733197192.0, + "step": 5344 + }, + { + "epoch": 1.4453758788534343, + "grad_norm": 3.046875, + "learning_rate": 0.01701475187422561, + "loss": 3.5363, + "mean_token_accuracy": 0.3651164174079895, + "num_tokens": 2733721364.0, + "step": 5345 + }, + { + "epoch": 1.445646295294754, + "grad_norm": 3.03125, + "learning_rate": 0.01701357905262191, + "loss": 3.0829, + "mean_token_accuracy": 0.393079549074173, + "num_tokens": 2734201019.0, + "step": 5346 + }, + { + "epoch": 1.4459167117360736, + "grad_norm": 2.453125, + "learning_rate": 0.01701240604650392, + "loss": 3.3465, + "mean_token_accuracy": 0.3764686584472656, + "num_tokens": 2734725219.0, + "step": 5347 + }, + { + "epoch": 1.4461871281773933, + "grad_norm": 3.40625, + "learning_rate": 0.01701123285590763, + "loss": 3.1907, + "mean_token_accuracy": 0.3874792456626892, + "num_tokens": 2735249378.0, + "step": 5348 + }, + { + "epoch": 1.446457544618713, + "grad_norm": 2.765625, + "learning_rate": 0.01701005948086904, + "loss": 3.3158, + "mean_token_accuracy": 0.38807809352874756, + "num_tokens": 2735773645.0, + "step": 5349 + }, + { + "epoch": 1.4467279610600325, + "grad_norm": 3.078125, + "learning_rate": 0.01700888592142415, + "loss": 3.2353, + "mean_token_accuracy": 0.38305139541625977, + "num_tokens": 2736297805.0, + "step": 5350 + }, + { + "epoch": 1.4469983775013522, + "grad_norm": 5.96875, + "learning_rate": 0.01700771217760897, + "loss": 10.087, + "mean_token_accuracy": 0.012119723483920097, + "num_tokens": 2736822043.0, + "step": 5351 + }, + { + "epoch": 1.4472687939426718, + "grad_norm": 8.375, + "learning_rate": 0.01700653824945951, + "loss": 3.8497, + "mean_token_accuracy": 0.3186342418193817, + "num_tokens": 2737346213.0, + "step": 5352 + }, + { + "epoch": 1.4475392103839915, + "grad_norm": 2.109375, + "learning_rate": 0.017005364137011796, + "loss": 3.2404, + "mean_token_accuracy": 0.3929552435874939, + "num_tokens": 2737870367.0, + "step": 5353 + }, + { + "epoch": 1.4478096268253111, + "grad_norm": 3.125, + "learning_rate": 0.017004189840301845, + "loss": 3.4809, + "mean_token_accuracy": 0.3655223250389099, + "num_tokens": 2738394558.0, + "step": 5354 + }, + { + "epoch": 1.4480800432666305, + "grad_norm": 2.734375, + "learning_rate": 0.017003015359365697, + "loss": 3.4614, + "mean_token_accuracy": 0.3778594732284546, + "num_tokens": 2738882046.0, + "step": 5355 + }, + { + "epoch": 1.4483504597079502, + "grad_norm": 3.6875, + "learning_rate": 0.017001840694239382, + "loss": 3.4357, + "mean_token_accuracy": 0.36125198006629944, + "num_tokens": 2739406183.0, + "step": 5356 + }, + { + "epoch": 1.4486208761492698, + "grad_norm": 2.53125, + "learning_rate": 0.017000665844958945, + "loss": 3.3318, + "mean_token_accuracy": 0.40185511112213135, + "num_tokens": 2739874135.0, + "step": 5357 + }, + { + "epoch": 1.4488912925905895, + "grad_norm": 3.59375, + "learning_rate": 0.01699949081156043, + "loss": 3.425, + "mean_token_accuracy": 0.3707902431488037, + "num_tokens": 2740398158.0, + "step": 5358 + }, + { + "epoch": 1.449161709031909, + "grad_norm": 3.328125, + "learning_rate": 0.016998315594079896, + "loss": 3.3546, + "mean_token_accuracy": 0.3742470145225525, + "num_tokens": 2740922306.0, + "step": 5359 + }, + { + "epoch": 1.4494321254732287, + "grad_norm": 3.96875, + "learning_rate": 0.0169971401925534, + "loss": 3.4393, + "mean_token_accuracy": 0.3515692353248596, + "num_tokens": 2741446309.0, + "step": 5360 + }, + { + "epoch": 1.4497025419145484, + "grad_norm": 3.5625, + "learning_rate": 0.01699596460701701, + "loss": 3.252, + "mean_token_accuracy": 0.4173441231250763, + "num_tokens": 2741949703.0, + "step": 5361 + }, + { + "epoch": 1.449972958355868, + "grad_norm": 2.359375, + "learning_rate": 0.016994788837506783, + "loss": 3.4354, + "mean_token_accuracy": 0.3776575028896332, + "num_tokens": 2742473859.0, + "step": 5362 + }, + { + "epoch": 1.4502433747971877, + "grad_norm": 3.703125, + "learning_rate": 0.016993612884058814, + "loss": 3.4615, + "mean_token_accuracy": 0.3669029772281647, + "num_tokens": 2742998062.0, + "step": 5363 + }, + { + "epoch": 1.4505137912385073, + "grad_norm": 3.09375, + "learning_rate": 0.016992436746709173, + "loss": 3.3781, + "mean_token_accuracy": 0.35906702280044556, + "num_tokens": 2743522189.0, + "step": 5364 + }, + { + "epoch": 1.450784207679827, + "grad_norm": 3.625, + "learning_rate": 0.01699126042549395, + "loss": 3.0808, + "mean_token_accuracy": 0.40426021814346313, + "num_tokens": 2744046301.0, + "step": 5365 + }, + { + "epoch": 1.4510546241211466, + "grad_norm": 2.984375, + "learning_rate": 0.016990083920449234, + "loss": 3.3421, + "mean_token_accuracy": 0.37420153617858887, + "num_tokens": 2744570575.0, + "step": 5366 + }, + { + "epoch": 1.4513250405624663, + "grad_norm": 3.234375, + "learning_rate": 0.016988907231611134, + "loss": 3.3783, + "mean_token_accuracy": 0.36539357900619507, + "num_tokens": 2745094665.0, + "step": 5367 + }, + { + "epoch": 1.451595457003786, + "grad_norm": 3.03125, + "learning_rate": 0.016987730359015744, + "loss": 3.4116, + "mean_token_accuracy": 0.37888991832733154, + "num_tokens": 2745589311.0, + "step": 5368 + }, + { + "epoch": 1.4518658734451055, + "grad_norm": 2.625, + "learning_rate": 0.01698655330269918, + "loss": 3.3227, + "mean_token_accuracy": 0.3871169984340668, + "num_tokens": 2746091028.0, + "step": 5369 + }, + { + "epoch": 1.452136289886425, + "grad_norm": 2.578125, + "learning_rate": 0.016985376062697552, + "loss": 3.293, + "mean_token_accuracy": 0.37669336795806885, + "num_tokens": 2746615229.0, + "step": 5370 + }, + { + "epoch": 1.4524067063277446, + "grad_norm": 41.25, + "learning_rate": 0.016984198639046988, + "loss": 24.4816, + "mean_token_accuracy": 0.03645939379930496, + "num_tokens": 2747139445.0, + "step": 5371 + }, + { + "epoch": 1.4526771227690642, + "grad_norm": 6.0, + "learning_rate": 0.01698302103178361, + "loss": 3.7685, + "mean_token_accuracy": 0.32091912627220154, + "num_tokens": 2747663728.0, + "step": 5372 + }, + { + "epoch": 1.4529475392103839, + "grad_norm": 2.40625, + "learning_rate": 0.016981843240943553, + "loss": 3.4053, + "mean_token_accuracy": 0.3715851306915283, + "num_tokens": 2748187923.0, + "step": 5373 + }, + { + "epoch": 1.4532179556517035, + "grad_norm": 3.375, + "learning_rate": 0.01698066526656295, + "loss": 3.4721, + "mean_token_accuracy": 0.36840376257896423, + "num_tokens": 2748712076.0, + "step": 5374 + }, + { + "epoch": 1.4534883720930232, + "grad_norm": 3.203125, + "learning_rate": 0.016979487108677957, + "loss": 3.2767, + "mean_token_accuracy": 0.38620442152023315, + "num_tokens": 2749236121.0, + "step": 5375 + }, + { + "epoch": 1.4537587885343428, + "grad_norm": 3.203125, + "learning_rate": 0.016978308767324705, + "loss": 3.3342, + "mean_token_accuracy": 0.36689823865890503, + "num_tokens": 2749760289.0, + "step": 5376 + }, + { + "epoch": 1.4540292049756625, + "grad_norm": 2.828125, + "learning_rate": 0.016977130242539364, + "loss": 3.4634, + "mean_token_accuracy": 0.36245477199554443, + "num_tokens": 2750284503.0, + "step": 5377 + }, + { + "epoch": 1.454299621416982, + "grad_norm": 4.0625, + "learning_rate": 0.01697595153435809, + "loss": 3.445, + "mean_token_accuracy": 0.3845233917236328, + "num_tokens": 2750792617.0, + "step": 5378 + }, + { + "epoch": 1.4545700378583017, + "grad_norm": 3.578125, + "learning_rate": 0.016974772642817045, + "loss": 3.2074, + "mean_token_accuracy": 0.3810245096683502, + "num_tokens": 2751316817.0, + "step": 5379 + }, + { + "epoch": 1.4548404542996214, + "grad_norm": 3.28125, + "learning_rate": 0.016973593567952413, + "loss": 3.207, + "mean_token_accuracy": 0.365559846162796, + "num_tokens": 2751841054.0, + "step": 5380 + }, + { + "epoch": 1.455110870740941, + "grad_norm": 2.71875, + "learning_rate": 0.016972414309800354, + "loss": 3.5238, + "mean_token_accuracy": 0.3783324956893921, + "num_tokens": 2752364365.0, + "step": 5381 + }, + { + "epoch": 1.4553812871822607, + "grad_norm": 3.046875, + "learning_rate": 0.016971234868397064, + "loss": 3.0971, + "mean_token_accuracy": 0.36606258153915405, + "num_tokens": 2752888627.0, + "step": 5382 + }, + { + "epoch": 1.4556517036235803, + "grad_norm": 5.1875, + "learning_rate": 0.01697005524377873, + "loss": 3.0472, + "mean_token_accuracy": 0.4504561126232147, + "num_tokens": 2753353771.0, + "step": 5383 + }, + { + "epoch": 1.4559221200649, + "grad_norm": 2.28125, + "learning_rate": 0.01696887543598154, + "loss": 3.3685, + "mean_token_accuracy": 0.38322657346725464, + "num_tokens": 2753878013.0, + "step": 5384 + }, + { + "epoch": 1.4561925365062196, + "grad_norm": 3.765625, + "learning_rate": 0.0169676954450417, + "loss": 3.3768, + "mean_token_accuracy": 0.3779813349246979, + "num_tokens": 2754369429.0, + "step": 5385 + }, + { + "epoch": 1.4564629529475392, + "grad_norm": 2.421875, + "learning_rate": 0.016966515270995415, + "loss": 3.5106, + "mean_token_accuracy": 0.3681856095790863, + "num_tokens": 2754875726.0, + "step": 5386 + }, + { + "epoch": 1.456733369388859, + "grad_norm": 3.15625, + "learning_rate": 0.016965334913878895, + "loss": 3.2966, + "mean_token_accuracy": 0.3596341013908386, + "num_tokens": 2755399883.0, + "step": 5387 + }, + { + "epoch": 1.4570037858301785, + "grad_norm": 3.625, + "learning_rate": 0.016964154373728356, + "loss": 3.1468, + "mean_token_accuracy": 0.3736456036567688, + "num_tokens": 2755924080.0, + "step": 5388 + }, + { + "epoch": 1.4572742022714982, + "grad_norm": 3.078125, + "learning_rate": 0.016962973650580023, + "loss": 3.3152, + "mean_token_accuracy": 0.3504646420478821, + "num_tokens": 2756448263.0, + "step": 5389 + }, + { + "epoch": 1.4575446187128178, + "grad_norm": 2.65625, + "learning_rate": 0.01696179274447012, + "loss": 3.2726, + "mean_token_accuracy": 0.3548968434333801, + "num_tokens": 2756972536.0, + "step": 5390 + }, + { + "epoch": 1.4578150351541375, + "grad_norm": 31.625, + "learning_rate": 0.016960611655434884, + "loss": 18.0997, + "mean_token_accuracy": 2.5368349270138424e-06, + "num_tokens": 2757496808.0, + "step": 5391 + }, + { + "epoch": 1.458085451595457, + "grad_norm": 6.25, + "learning_rate": 0.016959430383510558, + "loss": 3.9323, + "mean_token_accuracy": 0.3389895558357239, + "num_tokens": 2758008135.0, + "step": 5392 + }, + { + "epoch": 1.4583558680367767, + "grad_norm": 2.46875, + "learning_rate": 0.016958248928733374, + "loss": 3.5576, + "mean_token_accuracy": 0.3617214262485504, + "num_tokens": 2758521057.0, + "step": 5393 + }, + { + "epoch": 1.4586262844780964, + "grad_norm": 2.921875, + "learning_rate": 0.016957067291139598, + "loss": 3.3932, + "mean_token_accuracy": 0.3857344388961792, + "num_tokens": 2758987695.0, + "step": 5394 + }, + { + "epoch": 1.458896700919416, + "grad_norm": 3.984375, + "learning_rate": 0.016955885470765472, + "loss": 3.3551, + "mean_token_accuracy": 0.3654700517654419, + "num_tokens": 2759511902.0, + "step": 5395 + }, + { + "epoch": 1.4591671173607355, + "grad_norm": 2.671875, + "learning_rate": 0.016954703467647273, + "loss": 3.3944, + "mean_token_accuracy": 0.387611985206604, + "num_tokens": 2760036168.0, + "step": 5396 + }, + { + "epoch": 1.459437533802055, + "grad_norm": 3.5625, + "learning_rate": 0.016953521281821253, + "loss": 3.4338, + "mean_token_accuracy": 0.35972508788108826, + "num_tokens": 2760560435.0, + "step": 5397 + }, + { + "epoch": 1.4597079502433747, + "grad_norm": 3.109375, + "learning_rate": 0.016952338913323692, + "loss": 3.341, + "mean_token_accuracy": 0.38425213098526, + "num_tokens": 2761079467.0, + "step": 5398 + }, + { + "epoch": 1.4599783666846944, + "grad_norm": 3.65625, + "learning_rate": 0.016951156362190873, + "loss": 3.4347, + "mean_token_accuracy": 0.35951268672943115, + "num_tokens": 2761577915.0, + "step": 5399 + }, + { + "epoch": 1.460248783126014, + "grad_norm": 2.828125, + "learning_rate": 0.016949973628459073, + "loss": 3.2961, + "mean_token_accuracy": 0.36763662099838257, + "num_tokens": 2762101990.0, + "step": 5400 + }, + { + "epoch": 1.4605191995673337, + "grad_norm": 3.078125, + "learning_rate": 0.016948790712164583, + "loss": 3.3552, + "mean_token_accuracy": 0.40476900339126587, + "num_tokens": 2762561166.0, + "step": 5401 + }, + { + "epoch": 1.4607896160086533, + "grad_norm": 2.5625, + "learning_rate": 0.016947607613343702, + "loss": 3.4109, + "mean_token_accuracy": 0.3860585689544678, + "num_tokens": 2763085353.0, + "step": 5402 + }, + { + "epoch": 1.461060032449973, + "grad_norm": 2.90625, + "learning_rate": 0.01694642433203273, + "loss": 3.1182, + "mean_token_accuracy": 0.3621199131011963, + "num_tokens": 2763609626.0, + "step": 5403 + }, + { + "epoch": 1.4613304488912926, + "grad_norm": 2.71875, + "learning_rate": 0.01694524086826797, + "loss": 3.3544, + "mean_token_accuracy": 0.38615792989730835, + "num_tokens": 2764066021.0, + "step": 5404 + }, + { + "epoch": 1.4616008653326122, + "grad_norm": 2.734375, + "learning_rate": 0.016944057222085734, + "loss": 3.1937, + "mean_token_accuracy": 0.3602585196495056, + "num_tokens": 2764590297.0, + "step": 5405 + }, + { + "epoch": 1.4618712817739319, + "grad_norm": 2.546875, + "learning_rate": 0.016942873393522347, + "loss": 3.2885, + "mean_token_accuracy": 0.3778741955757141, + "num_tokens": 2765114400.0, + "step": 5406 + }, + { + "epoch": 1.4621416982152515, + "grad_norm": 2.703125, + "learning_rate": 0.016941689382614124, + "loss": 3.2375, + "mean_token_accuracy": 0.3858931064605713, + "num_tokens": 2765638661.0, + "step": 5407 + }, + { + "epoch": 1.4624121146565712, + "grad_norm": 3.859375, + "learning_rate": 0.0169405051893974, + "loss": 3.2827, + "mean_token_accuracy": 0.37578460574150085, + "num_tokens": 2766162868.0, + "step": 5408 + }, + { + "epoch": 1.4626825310978908, + "grad_norm": 3.09375, + "learning_rate": 0.016939320813908504, + "loss": 3.2936, + "mean_token_accuracy": 0.3951806426048279, + "num_tokens": 2766686857.0, + "step": 5409 + }, + { + "epoch": 1.4629529475392105, + "grad_norm": 3.0, + "learning_rate": 0.016938136256183782, + "loss": 3.2741, + "mean_token_accuracy": 0.3605692386627197, + "num_tokens": 2767211089.0, + "step": 5410 + }, + { + "epoch": 1.4632233639805299, + "grad_norm": 23.5, + "learning_rate": 0.016936951516259572, + "loss": 13.5627, + "mean_token_accuracy": 0.01847194880247116, + "num_tokens": 2767691474.0, + "step": 5411 + }, + { + "epoch": 1.4634937804218495, + "grad_norm": 6.4375, + "learning_rate": 0.016935766594172234, + "loss": 3.7431, + "mean_token_accuracy": 0.3558654189109802, + "num_tokens": 2768170987.0, + "step": 5412 + }, + { + "epoch": 1.4637641968631692, + "grad_norm": 2.3125, + "learning_rate": 0.01693458148995812, + "loss": 3.4665, + "mean_token_accuracy": 0.3577468991279602, + "num_tokens": 2768683781.0, + "step": 5413 + }, + { + "epoch": 1.4640346133044888, + "grad_norm": 3.890625, + "learning_rate": 0.016933396203653594, + "loss": 2.894, + "mean_token_accuracy": 0.4589380621910095, + "num_tokens": 2769128537.0, + "step": 5414 + }, + { + "epoch": 1.4643050297458085, + "grad_norm": 2.09375, + "learning_rate": 0.016932210735295022, + "loss": 3.1552, + "mean_token_accuracy": 0.3856866955757141, + "num_tokens": 2769652686.0, + "step": 5415 + }, + { + "epoch": 1.464575446187128, + "grad_norm": 3.15625, + "learning_rate": 0.01693102508491878, + "loss": 3.5126, + "mean_token_accuracy": 0.36107128858566284, + "num_tokens": 2770121267.0, + "step": 5416 + }, + { + "epoch": 1.4648458626284477, + "grad_norm": 2.59375, + "learning_rate": 0.01692983925256125, + "loss": 3.2308, + "mean_token_accuracy": 0.3968202769756317, + "num_tokens": 2770640149.0, + "step": 5417 + }, + { + "epoch": 1.4651162790697674, + "grad_norm": 3.625, + "learning_rate": 0.01692865323825881, + "loss": 3.3844, + "mean_token_accuracy": 0.37945038080215454, + "num_tokens": 2771164328.0, + "step": 5418 + }, + { + "epoch": 1.465386695511087, + "grad_norm": 3.03125, + "learning_rate": 0.016927467042047853, + "loss": 3.2728, + "mean_token_accuracy": 0.3994896411895752, + "num_tokens": 2771636832.0, + "step": 5419 + }, + { + "epoch": 1.4656571119524067, + "grad_norm": 3.25, + "learning_rate": 0.016926280663964774, + "loss": 3.3081, + "mean_token_accuracy": 0.3650292754173279, + "num_tokens": 2772160998.0, + "step": 5420 + }, + { + "epoch": 1.4659275283937263, + "grad_norm": 2.703125, + "learning_rate": 0.01692509410404598, + "loss": 3.4734, + "mean_token_accuracy": 0.38548916578292847, + "num_tokens": 2772672728.0, + "step": 5421 + }, + { + "epoch": 1.466197944835046, + "grad_norm": 2.984375, + "learning_rate": 0.01692390736232787, + "loss": 3.331, + "mean_token_accuracy": 0.37079572677612305, + "num_tokens": 2773160753.0, + "step": 5422 + }, + { + "epoch": 1.4664683612763656, + "grad_norm": 3.515625, + "learning_rate": 0.016922720438846864, + "loss": 3.2456, + "mean_token_accuracy": 0.425570011138916, + "num_tokens": 2773652174.0, + "step": 5423 + }, + { + "epoch": 1.4667387777176852, + "grad_norm": 2.96875, + "learning_rate": 0.016921533333639376, + "loss": 3.5388, + "mean_token_accuracy": 0.3851924240589142, + "num_tokens": 2774107337.0, + "step": 5424 + }, + { + "epoch": 1.4670091941590049, + "grad_norm": 3.390625, + "learning_rate": 0.016920346046741835, + "loss": 3.3009, + "mean_token_accuracy": 0.3878457248210907, + "num_tokens": 2774631570.0, + "step": 5425 + }, + { + "epoch": 1.4672796106003245, + "grad_norm": 3.234375, + "learning_rate": 0.016919158578190662, + "loss": 3.4554, + "mean_token_accuracy": 0.3766734302043915, + "num_tokens": 2775113108.0, + "step": 5426 + }, + { + "epoch": 1.4675500270416442, + "grad_norm": 4.09375, + "learning_rate": 0.0169179709280223, + "loss": 3.3773, + "mean_token_accuracy": 0.3906211256980896, + "num_tokens": 2775637328.0, + "step": 5427 + }, + { + "epoch": 1.4678204434829638, + "grad_norm": 2.984375, + "learning_rate": 0.01691678309627318, + "loss": 3.1477, + "mean_token_accuracy": 0.4359215497970581, + "num_tokens": 2776161591.0, + "step": 5428 + }, + { + "epoch": 1.4680908599242835, + "grad_norm": 2.625, + "learning_rate": 0.016915595082979764, + "loss": 3.2132, + "mean_token_accuracy": 0.4019066095352173, + "num_tokens": 2776622231.0, + "step": 5429 + }, + { + "epoch": 1.468361276365603, + "grad_norm": 2.296875, + "learning_rate": 0.01691440688817849, + "loss": 3.1992, + "mean_token_accuracy": 0.39148417115211487, + "num_tokens": 2777146459.0, + "step": 5430 + }, + { + "epoch": 1.4686316928069227, + "grad_norm": 17.5, + "learning_rate": 0.01691321851190582, + "loss": 16.3218, + "mean_token_accuracy": 2.6433208404341713e-06, + "num_tokens": 2777670739.0, + "step": 5431 + }, + { + "epoch": 1.4689021092482424, + "grad_norm": 7.625, + "learning_rate": 0.016912029954198212, + "loss": 4.0787, + "mean_token_accuracy": 0.25484806299209595, + "num_tokens": 2778169987.0, + "step": 5432 + }, + { + "epoch": 1.469172525689562, + "grad_norm": 2.4375, + "learning_rate": 0.016910841215092145, + "loss": 3.2336, + "mean_token_accuracy": 0.37887871265411377, + "num_tokens": 2778694146.0, + "step": 5433 + }, + { + "epoch": 1.4694429421308817, + "grad_norm": 2.703125, + "learning_rate": 0.01690965229462408, + "loss": 3.4931, + "mean_token_accuracy": 0.3754268288612366, + "num_tokens": 2779218395.0, + "step": 5434 + }, + { + "epoch": 1.4697133585722013, + "grad_norm": 3.296875, + "learning_rate": 0.01690846319283051, + "loss": 3.5586, + "mean_token_accuracy": 0.35807788372039795, + "num_tokens": 2779742501.0, + "step": 5435 + }, + { + "epoch": 1.469983775013521, + "grad_norm": 2.9375, + "learning_rate": 0.016907273909747913, + "loss": 3.2751, + "mean_token_accuracy": 0.38437333703041077, + "num_tokens": 2780266786.0, + "step": 5436 + }, + { + "epoch": 1.4702541914548404, + "grad_norm": 4.40625, + "learning_rate": 0.016906084445412778, + "loss": 3.602, + "mean_token_accuracy": 0.34466123580932617, + "num_tokens": 2780790979.0, + "step": 5437 + }, + { + "epoch": 1.47052460789616, + "grad_norm": 2.40625, + "learning_rate": 0.016904894799861604, + "loss": 3.3199, + "mean_token_accuracy": 0.3996530771255493, + "num_tokens": 2781315158.0, + "step": 5438 + }, + { + "epoch": 1.4707950243374797, + "grad_norm": 3.078125, + "learning_rate": 0.01690370497313089, + "loss": 3.3773, + "mean_token_accuracy": 0.36617475748062134, + "num_tokens": 2781812664.0, + "step": 5439 + }, + { + "epoch": 1.4710654407787993, + "grad_norm": 3.421875, + "learning_rate": 0.01690251496525715, + "loss": 3.5887, + "mean_token_accuracy": 0.35715553164482117, + "num_tokens": 2782336817.0, + "step": 5440 + }, + { + "epoch": 1.471335857220119, + "grad_norm": 4.625, + "learning_rate": 0.01690132477627689, + "loss": 3.2908, + "mean_token_accuracy": 0.3611328899860382, + "num_tokens": 2782861041.0, + "step": 5441 + }, + { + "epoch": 1.4716062736614386, + "grad_norm": 3.015625, + "learning_rate": 0.016900134406226633, + "loss": 3.3034, + "mean_token_accuracy": 0.36182910203933716, + "num_tokens": 2783385263.0, + "step": 5442 + }, + { + "epoch": 1.4718766901027582, + "grad_norm": 3.265625, + "learning_rate": 0.016898943855142897, + "loss": 3.4964, + "mean_token_accuracy": 0.34665292501449585, + "num_tokens": 2783909443.0, + "step": 5443 + }, + { + "epoch": 1.4721471065440779, + "grad_norm": 2.890625, + "learning_rate": 0.01689775312306222, + "loss": 3.4093, + "mean_token_accuracy": 0.377574622631073, + "num_tokens": 2784433668.0, + "step": 5444 + }, + { + "epoch": 1.4724175229853975, + "grad_norm": 3.4375, + "learning_rate": 0.016896562210021136, + "loss": 3.2961, + "mean_token_accuracy": 0.3775961399078369, + "num_tokens": 2784945507.0, + "step": 5445 + }, + { + "epoch": 1.4726879394267172, + "grad_norm": 2.484375, + "learning_rate": 0.016895371116056178, + "loss": 3.2773, + "mean_token_accuracy": 0.3826032280921936, + "num_tokens": 2785469677.0, + "step": 5446 + }, + { + "epoch": 1.4729583558680368, + "grad_norm": 3.09375, + "learning_rate": 0.016894179841203897, + "loss": 3.3487, + "mean_token_accuracy": 0.38403016328811646, + "num_tokens": 2785993928.0, + "step": 5447 + }, + { + "epoch": 1.4732287723093564, + "grad_norm": 2.78125, + "learning_rate": 0.016892988385500848, + "loss": 3.2176, + "mean_token_accuracy": 0.3949018716812134, + "num_tokens": 2786518059.0, + "step": 5448 + }, + { + "epoch": 1.473499188750676, + "grad_norm": 2.90625, + "learning_rate": 0.01689179674898358, + "loss": 3.3379, + "mean_token_accuracy": 0.3855777382850647, + "num_tokens": 2787033710.0, + "step": 5449 + }, + { + "epoch": 1.4737696051919957, + "grad_norm": 3.375, + "learning_rate": 0.016890604931688664, + "loss": 3.2925, + "mean_token_accuracy": 0.3594154119491577, + "num_tokens": 2787523221.0, + "step": 5450 + }, + { + "epoch": 1.4740400216333154, + "grad_norm": 19.625, + "learning_rate": 0.016889412933652665, + "loss": 14.9077, + "mean_token_accuracy": 0.04136389121413231, + "num_tokens": 2788047454.0, + "step": 5451 + }, + { + "epoch": 1.4743104380746348, + "grad_norm": 7.5625, + "learning_rate": 0.01688822075491216, + "loss": 3.9258, + "mean_token_accuracy": 0.2872151732444763, + "num_tokens": 2788548630.0, + "step": 5452 + }, + { + "epoch": 1.4745808545159544, + "grad_norm": 2.046875, + "learning_rate": 0.016887028395503725, + "loss": 3.3279, + "mean_token_accuracy": 0.36359861493110657, + "num_tokens": 2789072821.0, + "step": 5453 + }, + { + "epoch": 1.474851270957274, + "grad_norm": 2.75, + "learning_rate": 0.016885835855463945, + "loss": 3.3004, + "mean_token_accuracy": 0.40034520626068115, + "num_tokens": 2789596996.0, + "step": 5454 + }, + { + "epoch": 1.4751216873985937, + "grad_norm": 3.09375, + "learning_rate": 0.016884643134829413, + "loss": 3.3032, + "mean_token_accuracy": 0.3674209415912628, + "num_tokens": 2790118638.0, + "step": 5455 + }, + { + "epoch": 1.4753921038399134, + "grad_norm": 2.453125, + "learning_rate": 0.016883450233636722, + "loss": 3.3751, + "mean_token_accuracy": 0.399895042181015, + "num_tokens": 2790585249.0, + "step": 5456 + }, + { + "epoch": 1.475662520281233, + "grad_norm": 3.171875, + "learning_rate": 0.01688225715192248, + "loss": 2.9992, + "mean_token_accuracy": 0.39740675687789917, + "num_tokens": 2791109423.0, + "step": 5457 + }, + { + "epoch": 1.4759329367225527, + "grad_norm": 3.5625, + "learning_rate": 0.016881063889723286, + "loss": 3.0936, + "mean_token_accuracy": 0.39997926354408264, + "num_tokens": 2791633509.0, + "step": 5458 + }, + { + "epoch": 1.4762033531638723, + "grad_norm": 3.5625, + "learning_rate": 0.016879870447075757, + "loss": 3.3632, + "mean_token_accuracy": 0.3693559169769287, + "num_tokens": 2792099827.0, + "step": 5459 + }, + { + "epoch": 1.476473769605192, + "grad_norm": 2.453125, + "learning_rate": 0.016878676824016513, + "loss": 3.3375, + "mean_token_accuracy": 0.3714570105075836, + "num_tokens": 2792623709.0, + "step": 5460 + }, + { + "epoch": 1.4767441860465116, + "grad_norm": 3.171875, + "learning_rate": 0.016877483020582174, + "loss": 3.4952, + "mean_token_accuracy": 0.38068923354148865, + "num_tokens": 2793147883.0, + "step": 5461 + }, + { + "epoch": 1.4770146024878312, + "grad_norm": 2.671875, + "learning_rate": 0.016876289036809374, + "loss": 3.2097, + "mean_token_accuracy": 0.37765759229660034, + "num_tokens": 2793531313.0, + "step": 5462 + }, + { + "epoch": 1.4772850189291509, + "grad_norm": 2.984375, + "learning_rate": 0.016875094872734742, + "loss": 3.369, + "mean_token_accuracy": 0.3645525574684143, + "num_tokens": 2793995632.0, + "step": 5463 + }, + { + "epoch": 1.4775554353704705, + "grad_norm": 4.5625, + "learning_rate": 0.016873900528394922, + "loss": 3.5024, + "mean_token_accuracy": 0.37132275104522705, + "num_tokens": 2794467876.0, + "step": 5464 + }, + { + "epoch": 1.4778258518117902, + "grad_norm": 1.9296875, + "learning_rate": 0.016872706003826562, + "loss": 3.1045, + "mean_token_accuracy": 0.39276039600372314, + "num_tokens": 2794992011.0, + "step": 5465 + }, + { + "epoch": 1.4780962682531098, + "grad_norm": 2.71875, + "learning_rate": 0.016871511299066312, + "loss": 3.3763, + "mean_token_accuracy": 0.36703792214393616, + "num_tokens": 2795495776.0, + "step": 5466 + }, + { + "epoch": 1.4783666846944294, + "grad_norm": 2.71875, + "learning_rate": 0.016870316414150825, + "loss": 3.1158, + "mean_token_accuracy": 0.3902894854545593, + "num_tokens": 2796019964.0, + "step": 5467 + }, + { + "epoch": 1.478637101135749, + "grad_norm": 2.765625, + "learning_rate": 0.01686912134911677, + "loss": 3.283, + "mean_token_accuracy": 0.41149643063545227, + "num_tokens": 2796479416.0, + "step": 5468 + }, + { + "epoch": 1.4789075175770687, + "grad_norm": 2.671875, + "learning_rate": 0.01686792610400081, + "loss": 3.3191, + "mean_token_accuracy": 0.36776620149612427, + "num_tokens": 2797003586.0, + "step": 5469 + }, + { + "epoch": 1.4791779340183884, + "grad_norm": 3.46875, + "learning_rate": 0.01686673067883962, + "loss": 3.1444, + "mean_token_accuracy": 0.38739314675331116, + "num_tokens": 2797527790.0, + "step": 5470 + }, + { + "epoch": 1.479448350459708, + "grad_norm": 14.25, + "learning_rate": 0.016865535073669884, + "loss": 13.9158, + "mean_token_accuracy": 0.0, + "num_tokens": 2797994569.0, + "step": 5471 + }, + { + "epoch": 1.4797187669010277, + "grad_norm": 8.25, + "learning_rate": 0.01686433928852828, + "loss": 4.0778, + "mean_token_accuracy": 0.2987554669380188, + "num_tokens": 2798501282.0, + "step": 5472 + }, + { + "epoch": 1.4799891833423473, + "grad_norm": 2.0625, + "learning_rate": 0.0168631433234515, + "loss": 3.2982, + "mean_token_accuracy": 0.39177021384239197, + "num_tokens": 2799025550.0, + "step": 5473 + }, + { + "epoch": 1.480259599783667, + "grad_norm": 2.484375, + "learning_rate": 0.01686194717847624, + "loss": 3.1789, + "mean_token_accuracy": 0.37108004093170166, + "num_tokens": 2799549813.0, + "step": 5474 + }, + { + "epoch": 1.4805300162249866, + "grad_norm": 3.640625, + "learning_rate": 0.016860750853639205, + "loss": 3.4559, + "mean_token_accuracy": 0.38354843854904175, + "num_tokens": 2800074030.0, + "step": 5475 + }, + { + "epoch": 1.4808004326663062, + "grad_norm": 3.671875, + "learning_rate": 0.016859554348977096, + "loss": 3.3704, + "mean_token_accuracy": 0.3784157335758209, + "num_tokens": 2800598299.0, + "step": 5476 + }, + { + "epoch": 1.4810708491076259, + "grad_norm": 3.34375, + "learning_rate": 0.016858357664526626, + "loss": 3.4345, + "mean_token_accuracy": 0.3779028058052063, + "num_tokens": 2801122478.0, + "step": 5477 + }, + { + "epoch": 1.4813412655489453, + "grad_norm": 3.0625, + "learning_rate": 0.016857160800324518, + "loss": 3.3235, + "mean_token_accuracy": 0.36591219902038574, + "num_tokens": 2801646722.0, + "step": 5478 + }, + { + "epoch": 1.481611681990265, + "grad_norm": 2.75, + "learning_rate": 0.01685596375640749, + "loss": 3.3417, + "mean_token_accuracy": 0.38196101784706116, + "num_tokens": 2802142521.0, + "step": 5479 + }, + { + "epoch": 1.4818820984315846, + "grad_norm": 2.640625, + "learning_rate": 0.01685476653281227, + "loss": 3.0365, + "mean_token_accuracy": 0.4104467034339905, + "num_tokens": 2802666676.0, + "step": 5480 + }, + { + "epoch": 1.4821525148729042, + "grad_norm": 2.71875, + "learning_rate": 0.016853569129575603, + "loss": 3.3472, + "mean_token_accuracy": 0.37433838844299316, + "num_tokens": 2803190749.0, + "step": 5481 + }, + { + "epoch": 1.4824229313142239, + "grad_norm": 2.765625, + "learning_rate": 0.01685237154673421, + "loss": 3.2425, + "mean_token_accuracy": 0.3971550762653351, + "num_tokens": 2803663759.0, + "step": 5482 + }, + { + "epoch": 1.4826933477555435, + "grad_norm": 2.328125, + "learning_rate": 0.016851173784324856, + "loss": 3.2057, + "mean_token_accuracy": 0.38360559940338135, + "num_tokens": 2804188003.0, + "step": 5483 + }, + { + "epoch": 1.4829637641968632, + "grad_norm": 2.515625, + "learning_rate": 0.016849975842384284, + "loss": 3.1075, + "mean_token_accuracy": 0.40873458981513977, + "num_tokens": 2804663989.0, + "step": 5484 + }, + { + "epoch": 1.4832341806381828, + "grad_norm": 2.265625, + "learning_rate": 0.016848777720949244, + "loss": 3.1318, + "mean_token_accuracy": 0.39363062381744385, + "num_tokens": 2805188037.0, + "step": 5485 + }, + { + "epoch": 1.4835045970795024, + "grad_norm": 3.359375, + "learning_rate": 0.0168475794200565, + "loss": 3.4965, + "mean_token_accuracy": 0.3689471483230591, + "num_tokens": 2805712255.0, + "step": 5486 + }, + { + "epoch": 1.483775013520822, + "grad_norm": 3.078125, + "learning_rate": 0.01684638093974283, + "loss": 3.4206, + "mean_token_accuracy": 0.37182509899139404, + "num_tokens": 2806236414.0, + "step": 5487 + }, + { + "epoch": 1.4840454299621417, + "grad_norm": 2.96875, + "learning_rate": 0.016845182280045, + "loss": 3.1145, + "mean_token_accuracy": 0.425434947013855, + "num_tokens": 2806695015.0, + "step": 5488 + }, + { + "epoch": 1.4843158464034614, + "grad_norm": 2.578125, + "learning_rate": 0.016843983440999785, + "loss": 3.2985, + "mean_token_accuracy": 0.3992677628993988, + "num_tokens": 2807219252.0, + "step": 5489 + }, + { + "epoch": 1.484586262844781, + "grad_norm": 3.78125, + "learning_rate": 0.01684278442264397, + "loss": 3.5089, + "mean_token_accuracy": 0.3882373571395874, + "num_tokens": 2807641862.0, + "step": 5490 + }, + { + "epoch": 1.4848566792861007, + "grad_norm": 15.625, + "learning_rate": 0.016841585225014354, + "loss": 13.7837, + "mean_token_accuracy": 0.03565244376659393, + "num_tokens": 2808166136.0, + "step": 5491 + }, + { + "epoch": 1.4851270957274203, + "grad_norm": 7.4375, + "learning_rate": 0.016840385848147718, + "loss": 3.6041, + "mean_token_accuracy": 0.3414557874202728, + "num_tokens": 2808690244.0, + "step": 5492 + }, + { + "epoch": 1.48539751216874, + "grad_norm": 2.75, + "learning_rate": 0.01683918629208087, + "loss": 3.4731, + "mean_token_accuracy": 0.3799687325954437, + "num_tokens": 2809155284.0, + "step": 5493 + }, + { + "epoch": 1.4856679286100594, + "grad_norm": 3.140625, + "learning_rate": 0.016837986556850617, + "loss": 3.3634, + "mean_token_accuracy": 0.3750864565372467, + "num_tokens": 2809679554.0, + "step": 5494 + }, + { + "epoch": 1.485938345051379, + "grad_norm": 3.265625, + "learning_rate": 0.016836786642493766, + "loss": 3.4067, + "mean_token_accuracy": 0.3768744170665741, + "num_tokens": 2810203765.0, + "step": 5495 + }, + { + "epoch": 1.4862087614926986, + "grad_norm": 3.640625, + "learning_rate": 0.016835586549047134, + "loss": 3.2838, + "mean_token_accuracy": 0.3451949954032898, + "num_tokens": 2810727940.0, + "step": 5496 + }, + { + "epoch": 1.4864791779340183, + "grad_norm": 3.609375, + "learning_rate": 0.016834386276547545, + "loss": 3.5363, + "mean_token_accuracy": 0.35126978158950806, + "num_tokens": 2811252187.0, + "step": 5497 + }, + { + "epoch": 1.486749594375338, + "grad_norm": 3.265625, + "learning_rate": 0.016833185825031828, + "loss": 3.4599, + "mean_token_accuracy": 0.365648090839386, + "num_tokens": 2811776361.0, + "step": 5498 + }, + { + "epoch": 1.4870200108166576, + "grad_norm": 2.359375, + "learning_rate": 0.01683198519453682, + "loss": 3.1433, + "mean_token_accuracy": 0.392050176858902, + "num_tokens": 2812295339.0, + "step": 5499 + }, + { + "epoch": 1.4872904272579772, + "grad_norm": 2.59375, + "learning_rate": 0.016830784385099353, + "loss": 3.4334, + "mean_token_accuracy": 0.38330382108688354, + "num_tokens": 2812778267.0, + "step": 5500 + }, + { + "epoch": 1.4875608436992969, + "grad_norm": 2.453125, + "learning_rate": 0.016829583396756272, + "loss": 3.2344, + "mean_token_accuracy": 0.3644496202468872, + "num_tokens": 2813302542.0, + "step": 5501 + }, + { + "epoch": 1.4878312601406165, + "grad_norm": 3.296875, + "learning_rate": 0.01682838222954443, + "loss": 3.1063, + "mean_token_accuracy": 0.39170750975608826, + "num_tokens": 2813779696.0, + "step": 5502 + }, + { + "epoch": 1.4881016765819362, + "grad_norm": 2.546875, + "learning_rate": 0.016827180883500683, + "loss": 3.4127, + "mean_token_accuracy": 0.36905527114868164, + "num_tokens": 2814303963.0, + "step": 5503 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 3.3125, + "learning_rate": 0.016825979358661886, + "loss": 3.5242, + "mean_token_accuracy": 0.3773464560508728, + "num_tokens": 2814793944.0, + "step": 5504 + }, + { + "epoch": 1.4886425094645754, + "grad_norm": 2.453125, + "learning_rate": 0.01682477765506491, + "loss": 3.1804, + "mean_token_accuracy": 0.3907310962677002, + "num_tokens": 2815281204.0, + "step": 5505 + }, + { + "epoch": 1.488912925905895, + "grad_norm": 2.734375, + "learning_rate": 0.016823575772746627, + "loss": 3.1035, + "mean_token_accuracy": 0.40023308992385864, + "num_tokens": 2815805426.0, + "step": 5506 + }, + { + "epoch": 1.4891833423472147, + "grad_norm": 2.734375, + "learning_rate": 0.016822373711743915, + "loss": 3.1773, + "mean_token_accuracy": 0.3989884853363037, + "num_tokens": 2816267060.0, + "step": 5507 + }, + { + "epoch": 1.4894537587885344, + "grad_norm": 2.65625, + "learning_rate": 0.01682117147209365, + "loss": 3.3093, + "mean_token_accuracy": 0.37227779626846313, + "num_tokens": 2816791266.0, + "step": 5508 + }, + { + "epoch": 1.489724175229854, + "grad_norm": 3.203125, + "learning_rate": 0.01681996905383273, + "loss": 3.1152, + "mean_token_accuracy": 0.37791430950164795, + "num_tokens": 2817315397.0, + "step": 5509 + }, + { + "epoch": 1.4899945916711737, + "grad_norm": 2.734375, + "learning_rate": 0.016818766456998046, + "loss": 3.4229, + "mean_token_accuracy": 0.3700988292694092, + "num_tokens": 2817839560.0, + "step": 5510 + }, + { + "epoch": 1.4902650081124933, + "grad_norm": 114.0, + "learning_rate": 0.016817563681626492, + "loss": 17.7098, + "mean_token_accuracy": 0.0041578421369194984, + "num_tokens": 2818363845.0, + "step": 5511 + }, + { + "epoch": 1.490535424553813, + "grad_norm": 6.40625, + "learning_rate": 0.016816360727754977, + "loss": 3.5481, + "mean_token_accuracy": 0.35382354259490967, + "num_tokens": 2818887870.0, + "step": 5512 + }, + { + "epoch": 1.4908058409951326, + "grad_norm": 2.21875, + "learning_rate": 0.016815157595420408, + "loss": 3.4214, + "mean_token_accuracy": 0.3911857008934021, + "num_tokens": 2819356235.0, + "step": 5513 + }, + { + "epoch": 1.4910762574364522, + "grad_norm": 3.59375, + "learning_rate": 0.0168139542846597, + "loss": 3.4931, + "mean_token_accuracy": 0.36737656593322754, + "num_tokens": 2819880474.0, + "step": 5514 + }, + { + "epoch": 1.4913466738777719, + "grad_norm": 3.6875, + "learning_rate": 0.01681275079550978, + "loss": 3.2924, + "mean_token_accuracy": 0.38326242566108704, + "num_tokens": 2820404597.0, + "step": 5515 + }, + { + "epoch": 1.4916170903190915, + "grad_norm": 2.671875, + "learning_rate": 0.016811547128007573, + "loss": 3.3428, + "mean_token_accuracy": 0.3964850902557373, + "num_tokens": 2820872282.0, + "step": 5516 + }, + { + "epoch": 1.4918875067604112, + "grad_norm": 4.15625, + "learning_rate": 0.01681034328219001, + "loss": 3.1898, + "mean_token_accuracy": 0.43104639649391174, + "num_tokens": 2821351224.0, + "step": 5517 + }, + { + "epoch": 1.4921579232017308, + "grad_norm": 2.78125, + "learning_rate": 0.016809139258094026, + "loss": 3.2817, + "mean_token_accuracy": 0.3667742908000946, + "num_tokens": 2821875438.0, + "step": 5518 + }, + { + "epoch": 1.4924283396430502, + "grad_norm": 3.78125, + "learning_rate": 0.016807935055756566, + "loss": 3.3395, + "mean_token_accuracy": 0.3679245412349701, + "num_tokens": 2822399515.0, + "step": 5519 + }, + { + "epoch": 1.4926987560843699, + "grad_norm": 2.859375, + "learning_rate": 0.016806730675214578, + "loss": 3.1936, + "mean_token_accuracy": 0.38661283254623413, + "num_tokens": 2822913739.0, + "step": 5520 + }, + { + "epoch": 1.4929691725256895, + "grad_norm": 2.96875, + "learning_rate": 0.01680552611650502, + "loss": 3.1506, + "mean_token_accuracy": 0.39055562019348145, + "num_tokens": 2823437907.0, + "step": 5521 + }, + { + "epoch": 1.4932395889670091, + "grad_norm": 3.203125, + "learning_rate": 0.016804321379664842, + "loss": 3.2262, + "mean_token_accuracy": 0.366146981716156, + "num_tokens": 2823962010.0, + "step": 5522 + }, + { + "epoch": 1.4935100054083288, + "grad_norm": 2.765625, + "learning_rate": 0.016803116464731017, + "loss": 3.3899, + "mean_token_accuracy": 0.3765469193458557, + "num_tokens": 2824486123.0, + "step": 5523 + }, + { + "epoch": 1.4937804218496484, + "grad_norm": 3.375, + "learning_rate": 0.016801911371740516, + "loss": 3.3713, + "mean_token_accuracy": 0.37207943201065063, + "num_tokens": 2824957517.0, + "step": 5524 + }, + { + "epoch": 1.494050838290968, + "grad_norm": 2.90625, + "learning_rate": 0.016800706100730308, + "loss": 3.3575, + "mean_token_accuracy": 0.384738028049469, + "num_tokens": 2825481796.0, + "step": 5525 + }, + { + "epoch": 1.4943212547322877, + "grad_norm": 3.21875, + "learning_rate": 0.01679950065173738, + "loss": 3.2098, + "mean_token_accuracy": 0.3645217716693878, + "num_tokens": 2826005928.0, + "step": 5526 + }, + { + "epoch": 1.4945916711736074, + "grad_norm": 2.546875, + "learning_rate": 0.016798295024798716, + "loss": 3.3065, + "mean_token_accuracy": 0.3730960488319397, + "num_tokens": 2826530190.0, + "step": 5527 + }, + { + "epoch": 1.494862087614927, + "grad_norm": 3.15625, + "learning_rate": 0.016797089219951312, + "loss": 3.1637, + "mean_token_accuracy": 0.39659351110458374, + "num_tokens": 2827035150.0, + "step": 5528 + }, + { + "epoch": 1.4951325040562466, + "grad_norm": 2.65625, + "learning_rate": 0.01679588323723216, + "loss": 3.2143, + "mean_token_accuracy": 0.39934295415878296, + "num_tokens": 2827559408.0, + "step": 5529 + }, + { + "epoch": 1.4954029204975663, + "grad_norm": 3.484375, + "learning_rate": 0.016794677076678265, + "loss": 3.2284, + "mean_token_accuracy": 0.3734268844127655, + "num_tokens": 2828083659.0, + "step": 5530 + }, + { + "epoch": 1.495673336938886, + "grad_norm": 1.1953125, + "learning_rate": 0.016793470738326637, + "loss": 11.3365, + "mean_token_accuracy": 1.6786409105407074e-05, + "num_tokens": 2828607832.0, + "step": 5531 + }, + { + "epoch": 1.4959437533802056, + "grad_norm": 8.375, + "learning_rate": 0.016792264222214286, + "loss": 3.9164, + "mean_token_accuracy": 0.3341679871082306, + "num_tokens": 2829132103.0, + "step": 5532 + }, + { + "epoch": 1.4962141698215252, + "grad_norm": 1.9296875, + "learning_rate": 0.01679105752837824, + "loss": 3.2635, + "mean_token_accuracy": 0.3692528307437897, + "num_tokens": 2829656358.0, + "step": 5533 + }, + { + "epoch": 1.4964845862628449, + "grad_norm": 3.15625, + "learning_rate": 0.016789850656855513, + "loss": 3.2457, + "mean_token_accuracy": 0.3886508047580719, + "num_tokens": 2830130757.0, + "step": 5534 + }, + { + "epoch": 1.4967550027041643, + "grad_norm": 3.359375, + "learning_rate": 0.016788643607683146, + "loss": 3.3406, + "mean_token_accuracy": 0.3675556778907776, + "num_tokens": 2830654987.0, + "step": 5535 + }, + { + "epoch": 1.497025419145484, + "grad_norm": 3.015625, + "learning_rate": 0.016787436380898167, + "loss": 3.2618, + "mean_token_accuracy": 0.3794522285461426, + "num_tokens": 2831170453.0, + "step": 5536 + }, + { + "epoch": 1.4972958355868036, + "grad_norm": 3.4375, + "learning_rate": 0.016786228976537622, + "loss": 3.4577, + "mean_token_accuracy": 0.3758947551250458, + "num_tokens": 2831650764.0, + "step": 5537 + }, + { + "epoch": 1.4975662520281232, + "grad_norm": 3.015625, + "learning_rate": 0.01678502139463856, + "loss": 3.3519, + "mean_token_accuracy": 0.38358134031295776, + "num_tokens": 2832175041.0, + "step": 5538 + }, + { + "epoch": 1.4978366684694429, + "grad_norm": 3.28125, + "learning_rate": 0.01678381363523802, + "loss": 3.147, + "mean_token_accuracy": 0.37703466415405273, + "num_tokens": 2832699247.0, + "step": 5539 + }, + { + "epoch": 1.4981070849107625, + "grad_norm": 2.59375, + "learning_rate": 0.016782605698373074, + "loss": 3.2827, + "mean_token_accuracy": 0.39970070123672485, + "num_tokens": 2833166221.0, + "step": 5540 + }, + { + "epoch": 1.4983775013520821, + "grad_norm": 3.828125, + "learning_rate": 0.016781397584080777, + "loss": 3.6139, + "mean_token_accuracy": 0.3289607763290405, + "num_tokens": 2833690470.0, + "step": 5541 + }, + { + "epoch": 1.4986479177934018, + "grad_norm": 2.140625, + "learning_rate": 0.016780189292398204, + "loss": 3.4288, + "mean_token_accuracy": 0.3966383934020996, + "num_tokens": 2834155508.0, + "step": 5542 + }, + { + "epoch": 1.4989183342347214, + "grad_norm": 3.78125, + "learning_rate": 0.01677898082336242, + "loss": 3.2662, + "mean_token_accuracy": 0.37515443563461304, + "num_tokens": 2834679783.0, + "step": 5543 + }, + { + "epoch": 1.499188750676041, + "grad_norm": 2.640625, + "learning_rate": 0.016777772177010514, + "loss": 3.4273, + "mean_token_accuracy": 0.3815470337867737, + "num_tokens": 2835204062.0, + "step": 5544 + }, + { + "epoch": 1.4994591671173607, + "grad_norm": 4.03125, + "learning_rate": 0.016776563353379563, + "loss": 3.5212, + "mean_token_accuracy": 0.3584410548210144, + "num_tokens": 2835728181.0, + "step": 5545 + }, + { + "epoch": 1.4997295835586804, + "grad_norm": 3.671875, + "learning_rate": 0.016775354352506662, + "loss": 3.2331, + "mean_token_accuracy": 0.3710367679595947, + "num_tokens": 2836213441.0, + "step": 5546 + }, + { + "epoch": 1.5, + "grad_norm": 3.125, + "learning_rate": 0.016774145174428906, + "loss": 3.1507, + "mean_token_accuracy": 0.3964352607727051, + "num_tokens": 2836737710.0, + "step": 5547 + }, + { + "epoch": 1.5002704164413196, + "grad_norm": 7.28125, + "learning_rate": 0.016772935819183394, + "loss": 3.0626, + "mean_token_accuracy": 0.45814305543899536, + "num_tokens": 2837202259.0, + "step": 5548 + }, + { + "epoch": 1.5005408328826393, + "grad_norm": 2.375, + "learning_rate": 0.016771726286807235, + "loss": 3.172, + "mean_token_accuracy": 0.3696141541004181, + "num_tokens": 2837726291.0, + "step": 5549 + }, + { + "epoch": 1.500811249323959, + "grad_norm": 2.984375, + "learning_rate": 0.016770516577337538, + "loss": 3.0007, + "mean_token_accuracy": 0.3870357871055603, + "num_tokens": 2838207159.0, + "step": 5550 + }, + { + "epoch": 1.5010816657652786, + "grad_norm": 75.5, + "learning_rate": 0.016769306690811427, + "loss": 10.978, + "mean_token_accuracy": 0.0060938335955142975, + "num_tokens": 2838731369.0, + "step": 5551 + }, + { + "epoch": 1.5013520822065982, + "grad_norm": 9.125, + "learning_rate": 0.016768096627266013, + "loss": 4.0471, + "mean_token_accuracy": 0.32331395149230957, + "num_tokens": 2839233110.0, + "step": 5552 + }, + { + "epoch": 1.5016224986479179, + "grad_norm": 2.546875, + "learning_rate": 0.016766886386738438, + "loss": 3.5004, + "mean_token_accuracy": 0.3682089149951935, + "num_tokens": 2839720856.0, + "step": 5553 + }, + { + "epoch": 1.5018929150892375, + "grad_norm": 4.15625, + "learning_rate": 0.016765675969265828, + "loss": 3.2932, + "mean_token_accuracy": 0.3800123333930969, + "num_tokens": 2840245028.0, + "step": 5554 + }, + { + "epoch": 1.5021633315305571, + "grad_norm": 2.515625, + "learning_rate": 0.01676446537488532, + "loss": 3.3994, + "mean_token_accuracy": 0.37211549282073975, + "num_tokens": 2840732470.0, + "step": 5555 + }, + { + "epoch": 1.5024337479718768, + "grad_norm": 2.46875, + "learning_rate": 0.016763254603634065, + "loss": 3.4177, + "mean_token_accuracy": 0.36548662185668945, + "num_tokens": 2841241197.0, + "step": 5556 + }, + { + "epoch": 1.5027041644131964, + "grad_norm": 2.890625, + "learning_rate": 0.01676204365554921, + "loss": 3.2604, + "mean_token_accuracy": 0.36230987310409546, + "num_tokens": 2841713288.0, + "step": 5557 + }, + { + "epoch": 1.502974580854516, + "grad_norm": 2.3125, + "learning_rate": 0.016760832530667912, + "loss": 2.9615, + "mean_token_accuracy": 0.39171963930130005, + "num_tokens": 2842237513.0, + "step": 5558 + }, + { + "epoch": 1.5032449972958357, + "grad_norm": 2.578125, + "learning_rate": 0.016759621229027327, + "loss": 3.5215, + "mean_token_accuracy": 0.3434705436229706, + "num_tokens": 2842761798.0, + "step": 5559 + }, + { + "epoch": 1.5035154137371554, + "grad_norm": 4.96875, + "learning_rate": 0.01675840975066463, + "loss": 3.5552, + "mean_token_accuracy": 0.37458693981170654, + "num_tokens": 2843230538.0, + "step": 5560 + }, + { + "epoch": 1.503785830178475, + "grad_norm": 2.609375, + "learning_rate": 0.016757198095616984, + "loss": 3.3926, + "mean_token_accuracy": 0.3567889928817749, + "num_tokens": 2843754645.0, + "step": 5561 + }, + { + "epoch": 1.5040562466197946, + "grad_norm": 3.484375, + "learning_rate": 0.01675598626392157, + "loss": 3.3189, + "mean_token_accuracy": 0.39025968313217163, + "num_tokens": 2844222558.0, + "step": 5562 + }, + { + "epoch": 1.504326663061114, + "grad_norm": 2.859375, + "learning_rate": 0.016754774255615567, + "loss": 3.469, + "mean_token_accuracy": 0.34875375032424927, + "num_tokens": 2844746804.0, + "step": 5563 + }, + { + "epoch": 1.5045970795024337, + "grad_norm": 3.265625, + "learning_rate": 0.01675356207073617, + "loss": 3.4338, + "mean_token_accuracy": 0.38411033153533936, + "num_tokens": 2845270978.0, + "step": 5564 + }, + { + "epoch": 1.5048674959437534, + "grad_norm": 3.703125, + "learning_rate": 0.016752349709320567, + "loss": 3.5918, + "mean_token_accuracy": 0.35342103242874146, + "num_tokens": 2845743514.0, + "step": 5565 + }, + { + "epoch": 1.505137912385073, + "grad_norm": 2.53125, + "learning_rate": 0.01675113717140596, + "loss": 3.5682, + "mean_token_accuracy": 0.37699782848358154, + "num_tokens": 2846267733.0, + "step": 5566 + }, + { + "epoch": 1.5054083288263926, + "grad_norm": 3.328125, + "learning_rate": 0.01674992445702955, + "loss": 3.3454, + "mean_token_accuracy": 0.3690119981765747, + "num_tokens": 2846791887.0, + "step": 5567 + }, + { + "epoch": 1.5056787452677123, + "grad_norm": 2.515625, + "learning_rate": 0.01674871156622855, + "loss": 3.2727, + "mean_token_accuracy": 0.3859260380268097, + "num_tokens": 2847316029.0, + "step": 5568 + }, + { + "epoch": 1.505949161709032, + "grad_norm": 2.953125, + "learning_rate": 0.01674749849904017, + "loss": 3.3322, + "mean_token_accuracy": 0.38176465034484863, + "num_tokens": 2847840207.0, + "step": 5569 + }, + { + "epoch": 1.5062195781503516, + "grad_norm": 2.6875, + "learning_rate": 0.016746285255501637, + "loss": 3.4897, + "mean_token_accuracy": 0.37298235297203064, + "num_tokens": 2848364485.0, + "step": 5570 + }, + { + "epoch": 1.5064899945916712, + "grad_norm": 2.78125, + "learning_rate": 0.016745071835650174, + "loss": 10.8202, + "mean_token_accuracy": 2.220070564362686e-05, + "num_tokens": 2848888675.0, + "step": 5571 + }, + { + "epoch": 1.5067604110329909, + "grad_norm": 7.1875, + "learning_rate": 0.01674385823952301, + "loss": 3.7854, + "mean_token_accuracy": 0.3093189001083374, + "num_tokens": 2849357111.0, + "step": 5572 + }, + { + "epoch": 1.5070308274743103, + "grad_norm": 1.953125, + "learning_rate": 0.016742644467157383, + "loss": 3.2099, + "mean_token_accuracy": 0.38249534368515015, + "num_tokens": 2849881365.0, + "step": 5573 + }, + { + "epoch": 1.50730124391563, + "grad_norm": 2.4375, + "learning_rate": 0.01674143051859054, + "loss": 3.1734, + "mean_token_accuracy": 0.39620721340179443, + "num_tokens": 2850349024.0, + "step": 5574 + }, + { + "epoch": 1.5075716603569496, + "grad_norm": 3.046875, + "learning_rate": 0.016740216393859718, + "loss": 3.045, + "mean_token_accuracy": 0.3762771487236023, + "num_tokens": 2850873247.0, + "step": 5575 + }, + { + "epoch": 1.5078420767982692, + "grad_norm": 2.46875, + "learning_rate": 0.01673900209300218, + "loss": 3.5758, + "mean_token_accuracy": 0.3620595932006836, + "num_tokens": 2851397525.0, + "step": 5576 + }, + { + "epoch": 1.5081124932395888, + "grad_norm": 3.65625, + "learning_rate": 0.01673778761605518, + "loss": 3.2161, + "mean_token_accuracy": 0.38517606258392334, + "num_tokens": 2851921677.0, + "step": 5577 + }, + { + "epoch": 1.5083829096809085, + "grad_norm": 2.875, + "learning_rate": 0.01673657296305598, + "loss": 3.2727, + "mean_token_accuracy": 0.39261770248413086, + "num_tokens": 2852445782.0, + "step": 5578 + }, + { + "epoch": 1.5086533261222281, + "grad_norm": 3.703125, + "learning_rate": 0.016735358134041853, + "loss": 3.4121, + "mean_token_accuracy": 0.3388952612876892, + "num_tokens": 2852969980.0, + "step": 5579 + }, + { + "epoch": 1.5089237425635478, + "grad_norm": 2.640625, + "learning_rate": 0.016734143129050076, + "loss": 3.193, + "mean_token_accuracy": 0.3930162787437439, + "num_tokens": 2853494235.0, + "step": 5580 + }, + { + "epoch": 1.5091941590048674, + "grad_norm": 2.5625, + "learning_rate": 0.01673292794811792, + "loss": 3.2688, + "mean_token_accuracy": 0.3611209988594055, + "num_tokens": 2854018449.0, + "step": 5581 + }, + { + "epoch": 1.509464575446187, + "grad_norm": 2.625, + "learning_rate": 0.01673171259128267, + "loss": 3.3616, + "mean_token_accuracy": 0.3721092939376831, + "num_tokens": 2854542728.0, + "step": 5582 + }, + { + "epoch": 1.5097349918875067, + "grad_norm": 2.65625, + "learning_rate": 0.016730497058581633, + "loss": 3.1077, + "mean_token_accuracy": 0.39436855912208557, + "num_tokens": 2855066924.0, + "step": 5583 + }, + { + "epoch": 1.5100054083288263, + "grad_norm": 2.546875, + "learning_rate": 0.016729281350052087, + "loss": 3.2057, + "mean_token_accuracy": 0.3978121876716614, + "num_tokens": 2855557076.0, + "step": 5584 + }, + { + "epoch": 1.510275824770146, + "grad_norm": 2.75, + "learning_rate": 0.016728065465731343, + "loss": 3.2338, + "mean_token_accuracy": 0.3962623178958893, + "num_tokens": 2856033096.0, + "step": 5585 + }, + { + "epoch": 1.5105462412114656, + "grad_norm": 2.5625, + "learning_rate": 0.016726849405656705, + "loss": 3.315, + "mean_token_accuracy": 0.3834344446659088, + "num_tokens": 2856557365.0, + "step": 5586 + }, + { + "epoch": 1.5108166576527853, + "grad_norm": 3.078125, + "learning_rate": 0.016725633169865486, + "loss": 3.4422, + "mean_token_accuracy": 0.36990198493003845, + "num_tokens": 2857080404.0, + "step": 5587 + }, + { + "epoch": 1.511087074094105, + "grad_norm": 3.796875, + "learning_rate": 0.016724416758395002, + "loss": 3.3036, + "mean_token_accuracy": 0.38301146030426025, + "num_tokens": 2857604605.0, + "step": 5588 + }, + { + "epoch": 1.5113574905354246, + "grad_norm": 3.6875, + "learning_rate": 0.016723200171282575, + "loss": 3.4031, + "mean_token_accuracy": 0.3591381311416626, + "num_tokens": 2858128856.0, + "step": 5589 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 2.28125, + "learning_rate": 0.016721983408565542, + "loss": 3.2489, + "mean_token_accuracy": 0.3913372755050659, + "num_tokens": 2858653060.0, + "step": 5590 + }, + { + "epoch": 1.5118983234180638, + "grad_norm": 24.25, + "learning_rate": 0.01672076647028123, + "loss": 12.5392, + "mean_token_accuracy": 0.03673308342695236, + "num_tokens": 2859173849.0, + "step": 5591 + }, + { + "epoch": 1.5121687398593835, + "grad_norm": 7.59375, + "learning_rate": 0.016719549356466974, + "loss": 3.7628, + "mean_token_accuracy": 0.305844247341156, + "num_tokens": 2859680458.0, + "step": 5592 + }, + { + "epoch": 1.5124391563007031, + "grad_norm": 2.46875, + "learning_rate": 0.016718332067160127, + "loss": 3.3878, + "mean_token_accuracy": 0.381841242313385, + "num_tokens": 2860106579.0, + "step": 5593 + }, + { + "epoch": 1.5127095727420228, + "grad_norm": 4.3125, + "learning_rate": 0.016717114602398036, + "loss": 3.4471, + "mean_token_accuracy": 0.38415566086769104, + "num_tokens": 2860630735.0, + "step": 5594 + }, + { + "epoch": 1.5129799891833424, + "grad_norm": 3.421875, + "learning_rate": 0.016715896962218053, + "loss": 3.4372, + "mean_token_accuracy": 0.37219923734664917, + "num_tokens": 2861154768.0, + "step": 5595 + }, + { + "epoch": 1.513250405624662, + "grad_norm": 3.453125, + "learning_rate": 0.016714679146657543, + "loss": 3.3538, + "mean_token_accuracy": 0.38206303119659424, + "num_tokens": 2861678837.0, + "step": 5596 + }, + { + "epoch": 1.5135208220659817, + "grad_norm": 3.53125, + "learning_rate": 0.016713461155753874, + "loss": 3.3768, + "mean_token_accuracy": 0.3770256042480469, + "num_tokens": 2862202927.0, + "step": 5597 + }, + { + "epoch": 1.5137912385073014, + "grad_norm": 2.984375, + "learning_rate": 0.016712242989544414, + "loss": 3.4234, + "mean_token_accuracy": 0.3655382990837097, + "num_tokens": 2862700899.0, + "step": 5598 + }, + { + "epoch": 1.514061654948621, + "grad_norm": 2.8125, + "learning_rate": 0.016711024648066536, + "loss": 3.3863, + "mean_token_accuracy": 0.35660186409950256, + "num_tokens": 2863135532.0, + "step": 5599 + }, + { + "epoch": 1.5143320713899406, + "grad_norm": 4.03125, + "learning_rate": 0.01670980613135763, + "loss": 3.5179, + "mean_token_accuracy": 0.34043121337890625, + "num_tokens": 2863659696.0, + "step": 5600 + }, + { + "epoch": 1.5146024878312603, + "grad_norm": 2.765625, + "learning_rate": 0.01670858743945508, + "loss": 3.3498, + "mean_token_accuracy": 0.3833680748939514, + "num_tokens": 2864183785.0, + "step": 5601 + }, + { + "epoch": 1.51487290427258, + "grad_norm": 2.625, + "learning_rate": 0.016707368572396283, + "loss": 3.3206, + "mean_token_accuracy": 0.3818551301956177, + "num_tokens": 2864687257.0, + "step": 5602 + }, + { + "epoch": 1.5151433207138996, + "grad_norm": 53.75, + "learning_rate": 0.01670614953021863, + "loss": 3.2102, + "mean_token_accuracy": 0.3662724494934082, + "num_tokens": 2865211406.0, + "step": 5603 + }, + { + "epoch": 1.515413737155219, + "grad_norm": 4.21875, + "learning_rate": 0.016704930312959526, + "loss": 3.2478, + "mean_token_accuracy": 0.3705712556838989, + "num_tokens": 2865735686.0, + "step": 5604 + }, + { + "epoch": 1.5156841535965386, + "grad_norm": 3.359375, + "learning_rate": 0.016703710920656388, + "loss": 3.3539, + "mean_token_accuracy": 0.3831055164337158, + "num_tokens": 2866259897.0, + "step": 5605 + }, + { + "epoch": 1.5159545700378583, + "grad_norm": 3.828125, + "learning_rate": 0.016702491353346623, + "loss": 3.4704, + "mean_token_accuracy": 0.3779281675815582, + "num_tokens": 2866737501.0, + "step": 5606 + }, + { + "epoch": 1.516224986479178, + "grad_norm": 2.859375, + "learning_rate": 0.016701271611067655, + "loss": 3.2674, + "mean_token_accuracy": 0.38069412112236023, + "num_tokens": 2867261766.0, + "step": 5607 + }, + { + "epoch": 1.5164954029204976, + "grad_norm": 2.671875, + "learning_rate": 0.016700051693856906, + "loss": 3.0128, + "mean_token_accuracy": 0.390069842338562, + "num_tokens": 2867786000.0, + "step": 5608 + }, + { + "epoch": 1.5167658193618172, + "grad_norm": 2.625, + "learning_rate": 0.01669883160175181, + "loss": 3.3932, + "mean_token_accuracy": 0.38582277297973633, + "num_tokens": 2868216333.0, + "step": 5609 + }, + { + "epoch": 1.5170362358031368, + "grad_norm": 2.53125, + "learning_rate": 0.0166976113347898, + "loss": 3.1648, + "mean_token_accuracy": 0.396030068397522, + "num_tokens": 2868740599.0, + "step": 5610 + }, + { + "epoch": 1.5173066522444565, + "grad_norm": 9.0625, + "learning_rate": 0.01669639089300832, + "loss": 13.5129, + "mean_token_accuracy": 0.0, + "num_tokens": 2869210815.0, + "step": 5611 + }, + { + "epoch": 1.5175770686857761, + "grad_norm": 5.0625, + "learning_rate": 0.01669517027644481, + "loss": 3.6599, + "mean_token_accuracy": 0.3523402810096741, + "num_tokens": 2869729752.0, + "step": 5612 + }, + { + "epoch": 1.5178474851270958, + "grad_norm": 2.59375, + "learning_rate": 0.016693949485136732, + "loss": 3.2531, + "mean_token_accuracy": 0.36682820320129395, + "num_tokens": 2870254001.0, + "step": 5613 + }, + { + "epoch": 1.5181179015684152, + "grad_norm": 2.671875, + "learning_rate": 0.01669272851912154, + "loss": 3.1164, + "mean_token_accuracy": 0.4055417776107788, + "num_tokens": 2870778197.0, + "step": 5614 + }, + { + "epoch": 1.5183883180097348, + "grad_norm": 3.46875, + "learning_rate": 0.016691507378436693, + "loss": 3.3654, + "mean_token_accuracy": 0.3907735049724579, + "num_tokens": 2871265971.0, + "step": 5615 + }, + { + "epoch": 1.5186587344510545, + "grad_norm": 3.078125, + "learning_rate": 0.016690286063119668, + "loss": 3.5067, + "mean_token_accuracy": 0.38296133279800415, + "num_tokens": 2871790146.0, + "step": 5616 + }, + { + "epoch": 1.5189291508923741, + "grad_norm": 3.46875, + "learning_rate": 0.016689064573207926, + "loss": 3.3386, + "mean_token_accuracy": 0.374164879322052, + "num_tokens": 2872293772.0, + "step": 5617 + }, + { + "epoch": 1.5191995673336938, + "grad_norm": 2.28125, + "learning_rate": 0.016687842908738953, + "loss": 3.2051, + "mean_token_accuracy": 0.38884276151657104, + "num_tokens": 2872818029.0, + "step": 5618 + }, + { + "epoch": 1.5194699837750134, + "grad_norm": 2.671875, + "learning_rate": 0.016686621069750236, + "loss": 3.3386, + "mean_token_accuracy": 0.3776821196079254, + "num_tokens": 2873342198.0, + "step": 5619 + }, + { + "epoch": 1.519740400216333, + "grad_norm": 2.21875, + "learning_rate": 0.01668539905627926, + "loss": 3.1207, + "mean_token_accuracy": 0.4034459590911865, + "num_tokens": 2873817083.0, + "step": 5620 + }, + { + "epoch": 1.5200108166576527, + "grad_norm": 2.59375, + "learning_rate": 0.01668417686836352, + "loss": 3.4763, + "mean_token_accuracy": 0.3714466094970703, + "num_tokens": 2874341363.0, + "step": 5621 + }, + { + "epoch": 1.5202812330989723, + "grad_norm": 3.0625, + "learning_rate": 0.01668295450604052, + "loss": 3.3441, + "mean_token_accuracy": 0.36568713188171387, + "num_tokens": 2874859775.0, + "step": 5622 + }, + { + "epoch": 1.520551649540292, + "grad_norm": 3.0625, + "learning_rate": 0.016681731969347764, + "loss": 3.2419, + "mean_token_accuracy": 0.3830851912498474, + "num_tokens": 2875384049.0, + "step": 5623 + }, + { + "epoch": 1.5208220659816116, + "grad_norm": 3.546875, + "learning_rate": 0.016680509258322755, + "loss": 3.2102, + "mean_token_accuracy": 0.397161066532135, + "num_tokens": 2875908245.0, + "step": 5624 + }, + { + "epoch": 1.5210924824229313, + "grad_norm": 3.09375, + "learning_rate": 0.016679286373003022, + "loss": 3.2915, + "mean_token_accuracy": 0.36371445655822754, + "num_tokens": 2876432521.0, + "step": 5625 + }, + { + "epoch": 1.521362898864251, + "grad_norm": 3.421875, + "learning_rate": 0.01667806331342608, + "loss": 3.4928, + "mean_token_accuracy": 0.3706093430519104, + "num_tokens": 2876902574.0, + "step": 5626 + }, + { + "epoch": 1.5216333153055706, + "grad_norm": 2.71875, + "learning_rate": 0.016676840079629458, + "loss": 3.3217, + "mean_token_accuracy": 0.4046911597251892, + "num_tokens": 2877426736.0, + "step": 5627 + }, + { + "epoch": 1.5219037317468902, + "grad_norm": 3.140625, + "learning_rate": 0.01667561667165069, + "loss": 3.4762, + "mean_token_accuracy": 0.37897396087646484, + "num_tokens": 2877950983.0, + "step": 5628 + }, + { + "epoch": 1.5221741481882098, + "grad_norm": 2.21875, + "learning_rate": 0.016674393089527312, + "loss": 3.3573, + "mean_token_accuracy": 0.36652565002441406, + "num_tokens": 2878475264.0, + "step": 5629 + }, + { + "epoch": 1.5224445646295295, + "grad_norm": 3.984375, + "learning_rate": 0.016673169333296864, + "loss": 3.4526, + "mean_token_accuracy": 0.3807072043418884, + "num_tokens": 2878999546.0, + "step": 5630 + }, + { + "epoch": 1.5227149810708491, + "grad_norm": 188.0, + "learning_rate": 0.016671945402996897, + "loss": 21.7572, + "mean_token_accuracy": 5.115148815093562e-05, + "num_tokens": 2879473155.0, + "step": 5631 + }, + { + "epoch": 1.5229853975121688, + "grad_norm": 8.5, + "learning_rate": 0.016670721298664967, + "loss": 4.0889, + "mean_token_accuracy": 0.3097760081291199, + "num_tokens": 2879997425.0, + "step": 5632 + }, + { + "epoch": 1.5232558139534884, + "grad_norm": 2.4375, + "learning_rate": 0.016669497020338632, + "loss": 3.2669, + "mean_token_accuracy": 0.36940884590148926, + "num_tokens": 2880521665.0, + "step": 5633 + }, + { + "epoch": 1.523526230394808, + "grad_norm": 3.3125, + "learning_rate": 0.016668272568055448, + "loss": 3.4634, + "mean_token_accuracy": 0.34604957699775696, + "num_tokens": 2881045877.0, + "step": 5634 + }, + { + "epoch": 1.5237966468361277, + "grad_norm": 2.65625, + "learning_rate": 0.016667047941853, + "loss": 3.1565, + "mean_token_accuracy": 0.40086066722869873, + "num_tokens": 2881527051.0, + "step": 5635 + }, + { + "epoch": 1.5240670632774473, + "grad_norm": 3.359375, + "learning_rate": 0.016665823141768853, + "loss": 3.4327, + "mean_token_accuracy": 0.3818259537220001, + "num_tokens": 2882051314.0, + "step": 5636 + }, + { + "epoch": 1.524337479718767, + "grad_norm": 3.34375, + "learning_rate": 0.01666459816784059, + "loss": 3.1423, + "mean_token_accuracy": 0.3723786175251007, + "num_tokens": 2882487917.0, + "step": 5637 + }, + { + "epoch": 1.5246078961600866, + "grad_norm": 3.484375, + "learning_rate": 0.016663373020105798, + "loss": 3.3301, + "mean_token_accuracy": 0.35214248299598694, + "num_tokens": 2883012167.0, + "step": 5638 + }, + { + "epoch": 1.5248783126014063, + "grad_norm": 2.828125, + "learning_rate": 0.016662147698602064, + "loss": 3.251, + "mean_token_accuracy": 0.3800833225250244, + "num_tokens": 2883505734.0, + "step": 5639 + }, + { + "epoch": 1.525148729042726, + "grad_norm": 2.71875, + "learning_rate": 0.016660922203366988, + "loss": 3.34, + "mean_token_accuracy": 0.39343714714050293, + "num_tokens": 2883981030.0, + "step": 5640 + }, + { + "epoch": 1.5254191454840456, + "grad_norm": 2.453125, + "learning_rate": 0.016659696534438172, + "loss": 3.1721, + "mean_token_accuracy": 0.3710373044013977, + "num_tokens": 2884505251.0, + "step": 5641 + }, + { + "epoch": 1.5256895619253652, + "grad_norm": 2.25, + "learning_rate": 0.016658470691853222, + "loss": 3.1996, + "mean_token_accuracy": 0.3756527304649353, + "num_tokens": 2885029447.0, + "step": 5642 + }, + { + "epoch": 1.5259599783666848, + "grad_norm": 2.5, + "learning_rate": 0.016657244675649753, + "loss": 3.2031, + "mean_token_accuracy": 0.38163235783576965, + "num_tokens": 2885527437.0, + "step": 5643 + }, + { + "epoch": 1.5262303948080045, + "grad_norm": 2.796875, + "learning_rate": 0.016656018485865377, + "loss": 3.3433, + "mean_token_accuracy": 0.3919323682785034, + "num_tokens": 2886051697.0, + "step": 5644 + }, + { + "epoch": 1.526500811249324, + "grad_norm": 6.09375, + "learning_rate": 0.016654792122537725, + "loss": 3.7186, + "mean_token_accuracy": 0.3366192877292633, + "num_tokens": 2886575751.0, + "step": 5645 + }, + { + "epoch": 1.5267712276906436, + "grad_norm": 10.8125, + "learning_rate": 0.016653565585704414, + "loss": 3.5424, + "mean_token_accuracy": 0.3873409926891327, + "num_tokens": 2887099903.0, + "step": 5646 + }, + { + "epoch": 1.5270416441319632, + "grad_norm": 1.859375, + "learning_rate": 0.01665233887540309, + "loss": 3.4422, + "mean_token_accuracy": 0.3672252893447876, + "num_tokens": 2887624175.0, + "step": 5647 + }, + { + "epoch": 1.5273120605732828, + "grad_norm": 2.609375, + "learning_rate": 0.016651111991671387, + "loss": 3.359, + "mean_token_accuracy": 0.37180978059768677, + "num_tokens": 2888148338.0, + "step": 5648 + }, + { + "epoch": 1.5275824770146025, + "grad_norm": 3.515625, + "learning_rate": 0.016649884934546947, + "loss": 3.4046, + "mean_token_accuracy": 0.3643937110900879, + "num_tokens": 2888614719.0, + "step": 5649 + }, + { + "epoch": 1.5278528934559221, + "grad_norm": 2.734375, + "learning_rate": 0.016648657704067423, + "loss": 3.2913, + "mean_token_accuracy": 0.39011579751968384, + "num_tokens": 2889110430.0, + "step": 5650 + }, + { + "epoch": 1.5281233098972418, + "grad_norm": 20.5, + "learning_rate": 0.016647430300270466, + "loss": 11.5859, + "mean_token_accuracy": 0.007789941970258951, + "num_tokens": 2889634674.0, + "step": 5651 + }, + { + "epoch": 1.5283937263385614, + "grad_norm": 6.0, + "learning_rate": 0.016646202723193747, + "loss": 3.9416, + "mean_token_accuracy": 0.2992390990257263, + "num_tokens": 2890097914.0, + "step": 5652 + }, + { + "epoch": 1.528664142779881, + "grad_norm": 2.40625, + "learning_rate": 0.01664497497287492, + "loss": 3.1212, + "mean_token_accuracy": 0.3779028654098511, + "num_tokens": 2890622186.0, + "step": 5653 + }, + { + "epoch": 1.5289345592212007, + "grad_norm": 3.171875, + "learning_rate": 0.01664374704935166, + "loss": 3.6574, + "mean_token_accuracy": 0.3672313094139099, + "num_tokens": 2891084498.0, + "step": 5654 + }, + { + "epoch": 1.5292049756625201, + "grad_norm": 3.25, + "learning_rate": 0.01664251895266164, + "loss": 3.3321, + "mean_token_accuracy": 0.3829909563064575, + "num_tokens": 2891608737.0, + "step": 5655 + }, + { + "epoch": 1.5294753921038398, + "grad_norm": 2.65625, + "learning_rate": 0.016641290682842546, + "loss": 3.3169, + "mean_token_accuracy": 0.3650488555431366, + "num_tokens": 2892132687.0, + "step": 5656 + }, + { + "epoch": 1.5297458085451594, + "grad_norm": 2.734375, + "learning_rate": 0.016640062239932068, + "loss": 3.3739, + "mean_token_accuracy": 0.35414567589759827, + "num_tokens": 2892656899.0, + "step": 5657 + }, + { + "epoch": 1.530016224986479, + "grad_norm": 2.859375, + "learning_rate": 0.01663883362396789, + "loss": 3.3502, + "mean_token_accuracy": 0.37605154514312744, + "num_tokens": 2893181039.0, + "step": 5658 + }, + { + "epoch": 1.5302866414277987, + "grad_norm": 3.265625, + "learning_rate": 0.016637604834987715, + "loss": 3.2929, + "mean_token_accuracy": 0.36697909235954285, + "num_tokens": 2893680356.0, + "step": 5659 + }, + { + "epoch": 1.5305570578691183, + "grad_norm": 2.046875, + "learning_rate": 0.016636375873029245, + "loss": 3.2545, + "mean_token_accuracy": 0.39445289969444275, + "num_tokens": 2894191961.0, + "step": 5660 + }, + { + "epoch": 1.530827474310438, + "grad_norm": 5.5, + "learning_rate": 0.01663514673813019, + "loss": 3.0835, + "mean_token_accuracy": 0.42454129457473755, + "num_tokens": 2894716104.0, + "step": 5661 + }, + { + "epoch": 1.5310978907517576, + "grad_norm": 2.1875, + "learning_rate": 0.016633917430328257, + "loss": 3.2909, + "mean_token_accuracy": 0.3834415078163147, + "num_tokens": 2895240317.0, + "step": 5662 + }, + { + "epoch": 1.5313683071930773, + "grad_norm": 4.0625, + "learning_rate": 0.016632687949661173, + "loss": 3.529, + "mean_token_accuracy": 0.36752083897590637, + "num_tokens": 2895683218.0, + "step": 5663 + }, + { + "epoch": 1.531638723634397, + "grad_norm": 1.9140625, + "learning_rate": 0.016631458296166652, + "loss": 3.1989, + "mean_token_accuracy": 0.37989842891693115, + "num_tokens": 2896207295.0, + "step": 5664 + }, + { + "epoch": 1.5319091400757165, + "grad_norm": 2.890625, + "learning_rate": 0.016630228469882435, + "loss": 3.0465, + "mean_token_accuracy": 0.39755725860595703, + "num_tokens": 2896731495.0, + "step": 5665 + }, + { + "epoch": 1.5321795565170362, + "grad_norm": 2.453125, + "learning_rate": 0.01662899847084625, + "loss": 3.281, + "mean_token_accuracy": 0.39176011085510254, + "num_tokens": 2897255602.0, + "step": 5666 + }, + { + "epoch": 1.5324499729583558, + "grad_norm": 2.484375, + "learning_rate": 0.01662776829909584, + "loss": 3.076, + "mean_token_accuracy": 0.4119095802307129, + "num_tokens": 2897728320.0, + "step": 5667 + }, + { + "epoch": 1.5327203893996755, + "grad_norm": 2.515625, + "learning_rate": 0.016626537954668945, + "loss": 3.4514, + "mean_token_accuracy": 0.3669966459274292, + "num_tokens": 2898252594.0, + "step": 5668 + }, + { + "epoch": 1.5329908058409951, + "grad_norm": 3.25, + "learning_rate": 0.016625307437603317, + "loss": 3.2731, + "mean_token_accuracy": 0.3777737021446228, + "num_tokens": 2898776830.0, + "step": 5669 + }, + { + "epoch": 1.5332612222823148, + "grad_norm": 3.625, + "learning_rate": 0.016624076747936718, + "loss": 3.1822, + "mean_token_accuracy": 0.39962267875671387, + "num_tokens": 2899225883.0, + "step": 5670 + }, + { + "epoch": 1.5335316387236344, + "grad_norm": 46.75, + "learning_rate": 0.016622845885706904, + "loss": 12.6993, + "mean_token_accuracy": 0.02753761038184166, + "num_tokens": 2899750067.0, + "step": 5671 + }, + { + "epoch": 1.533802055164954, + "grad_norm": 7.59375, + "learning_rate": 0.016621614850951642, + "loss": 3.9426, + "mean_token_accuracy": 0.3261006474494934, + "num_tokens": 2900274330.0, + "step": 5672 + }, + { + "epoch": 1.5340724716062737, + "grad_norm": 1.921875, + "learning_rate": 0.016620383643708707, + "loss": 3.3124, + "mean_token_accuracy": 0.3816022276878357, + "num_tokens": 2900798561.0, + "step": 5673 + }, + { + "epoch": 1.5343428880475933, + "grad_norm": 3.21875, + "learning_rate": 0.01661915226401587, + "loss": 3.4543, + "mean_token_accuracy": 0.3627791106700897, + "num_tokens": 2901267010.0, + "step": 5674 + }, + { + "epoch": 1.534613304488913, + "grad_norm": 3.5625, + "learning_rate": 0.016617920711910917, + "loss": 3.1968, + "mean_token_accuracy": 0.38775986433029175, + "num_tokens": 2901791217.0, + "step": 5675 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 2.84375, + "learning_rate": 0.016616688987431637, + "loss": 3.3783, + "mean_token_accuracy": 0.3905133605003357, + "num_tokens": 2902290283.0, + "step": 5676 + }, + { + "epoch": 1.5351541373715523, + "grad_norm": 3.609375, + "learning_rate": 0.016615457090615817, + "loss": 3.2729, + "mean_token_accuracy": 0.38347309827804565, + "num_tokens": 2902814376.0, + "step": 5677 + }, + { + "epoch": 1.535424553812872, + "grad_norm": 3.09375, + "learning_rate": 0.016614225021501262, + "loss": 2.9796, + "mean_token_accuracy": 0.39959245920181274, + "num_tokens": 2903338508.0, + "step": 5678 + }, + { + "epoch": 1.5356949702541915, + "grad_norm": 3.578125, + "learning_rate": 0.016612992780125777, + "loss": 3.6592, + "mean_token_accuracy": 0.3584260940551758, + "num_tokens": 2903846026.0, + "step": 5679 + }, + { + "epoch": 1.5359653866955112, + "grad_norm": 2.578125, + "learning_rate": 0.016611760366527163, + "loss": 3.3003, + "mean_token_accuracy": 0.38701963424682617, + "num_tokens": 2904308289.0, + "step": 5680 + }, + { + "epoch": 1.5362358031368308, + "grad_norm": 3.140625, + "learning_rate": 0.016610527780743237, + "loss": 3.341, + "mean_token_accuracy": 0.3840807378292084, + "num_tokens": 2904824073.0, + "step": 5681 + }, + { + "epoch": 1.5365062195781505, + "grad_norm": 3.46875, + "learning_rate": 0.016609295022811818, + "loss": 3.3365, + "mean_token_accuracy": 0.3285939693450928, + "num_tokens": 2905348276.0, + "step": 5682 + }, + { + "epoch": 1.5367766360194701, + "grad_norm": 3.078125, + "learning_rate": 0.016608062092770735, + "loss": 3.6093, + "mean_token_accuracy": 0.35741811990737915, + "num_tokens": 2905872531.0, + "step": 5683 + }, + { + "epoch": 1.5370470524607898, + "grad_norm": 3.609375, + "learning_rate": 0.016606828990657817, + "loss": 3.2558, + "mean_token_accuracy": 0.3905262053012848, + "num_tokens": 2906396698.0, + "step": 5684 + }, + { + "epoch": 1.5373174689021094, + "grad_norm": 2.5625, + "learning_rate": 0.016605595716510893, + "loss": 3.3087, + "mean_token_accuracy": 0.3970709443092346, + "num_tokens": 2906893126.0, + "step": 5685 + }, + { + "epoch": 1.5375878853434288, + "grad_norm": 2.453125, + "learning_rate": 0.01660436227036781, + "loss": 3.2632, + "mean_token_accuracy": 0.37378114461898804, + "num_tokens": 2907390074.0, + "step": 5686 + }, + { + "epoch": 1.5378583017847485, + "grad_norm": 3.09375, + "learning_rate": 0.01660312865226641, + "loss": 3.2911, + "mean_token_accuracy": 0.348263680934906, + "num_tokens": 2907914320.0, + "step": 5687 + }, + { + "epoch": 1.5381287182260681, + "grad_norm": 2.9375, + "learning_rate": 0.016601894862244546, + "loss": 3.2941, + "mean_token_accuracy": 0.37224066257476807, + "num_tokens": 2908438595.0, + "step": 5688 + }, + { + "epoch": 1.5383991346673878, + "grad_norm": 2.984375, + "learning_rate": 0.016600660900340073, + "loss": 3.3496, + "mean_token_accuracy": 0.36743301153182983, + "num_tokens": 2908951676.0, + "step": 5689 + }, + { + "epoch": 1.5386695511087074, + "grad_norm": 3.359375, + "learning_rate": 0.016599426766590847, + "loss": 3.3105, + "mean_token_accuracy": 0.3796238899230957, + "num_tokens": 2909475859.0, + "step": 5690 + }, + { + "epoch": 1.538939967550027, + "grad_norm": 40.5, + "learning_rate": 0.01659819246103475, + "loss": 27.3744, + "mean_token_accuracy": 0.0, + "num_tokens": 2909999932.0, + "step": 5691 + }, + { + "epoch": 1.5392103839913467, + "grad_norm": 5.3125, + "learning_rate": 0.01659695798370964, + "loss": 3.6315, + "mean_token_accuracy": 0.32148611545562744, + "num_tokens": 2910524112.0, + "step": 5692 + }, + { + "epoch": 1.5394808004326663, + "grad_norm": 1.9765625, + "learning_rate": 0.016595723334653402, + "loss": 3.3917, + "mean_token_accuracy": 0.3588913381099701, + "num_tokens": 2911048376.0, + "step": 5693 + }, + { + "epoch": 1.539751216873986, + "grad_norm": 3.265625, + "learning_rate": 0.016594488513903913, + "loss": 3.2975, + "mean_token_accuracy": 0.384898841381073, + "num_tokens": 2911572550.0, + "step": 5694 + }, + { + "epoch": 1.5400216333153056, + "grad_norm": 2.96875, + "learning_rate": 0.01659325352149907, + "loss": 3.4409, + "mean_token_accuracy": 0.37165307998657227, + "num_tokens": 2912045845.0, + "step": 5695 + }, + { + "epoch": 1.540292049756625, + "grad_norm": 2.59375, + "learning_rate": 0.016592018357476753, + "loss": 3.0837, + "mean_token_accuracy": 0.41000962257385254, + "num_tokens": 2912533967.0, + "step": 5696 + }, + { + "epoch": 1.5405624661979447, + "grad_norm": 4.28125, + "learning_rate": 0.01659078302187487, + "loss": 3.1675, + "mean_token_accuracy": 0.4405975937843323, + "num_tokens": 2913058210.0, + "step": 5697 + }, + { + "epoch": 1.5408328826392643, + "grad_norm": 3.46875, + "learning_rate": 0.016589547514731323, + "loss": 3.1166, + "mean_token_accuracy": 0.38537418842315674, + "num_tokens": 2913582350.0, + "step": 5698 + }, + { + "epoch": 1.541103299080584, + "grad_norm": 3.046875, + "learning_rate": 0.016588311836084023, + "loss": 3.3088, + "mean_token_accuracy": 0.38853123784065247, + "num_tokens": 2914106461.0, + "step": 5699 + }, + { + "epoch": 1.5413737155219036, + "grad_norm": 2.6875, + "learning_rate": 0.01658707598597088, + "loss": 3.3866, + "mean_token_accuracy": 0.3590236306190491, + "num_tokens": 2914630723.0, + "step": 5700 + }, + { + "epoch": 1.5416441319632233, + "grad_norm": 2.5625, + "learning_rate": 0.016585839964429815, + "loss": 3.1398, + "mean_token_accuracy": 0.4013952612876892, + "num_tokens": 2915154977.0, + "step": 5701 + }, + { + "epoch": 1.541914548404543, + "grad_norm": 3.171875, + "learning_rate": 0.016584603771498754, + "loss": 3.4096, + "mean_token_accuracy": 0.41978251934051514, + "num_tokens": 2915614835.0, + "step": 5702 + }, + { + "epoch": 1.5421849648458625, + "grad_norm": 3.25, + "learning_rate": 0.016583367407215624, + "loss": 3.2621, + "mean_token_accuracy": 0.40100401639938354, + "num_tokens": 2916138995.0, + "step": 5703 + }, + { + "epoch": 1.5424553812871822, + "grad_norm": 3.34375, + "learning_rate": 0.016582130871618365, + "loss": 3.2985, + "mean_token_accuracy": 0.3776020407676697, + "num_tokens": 2916607191.0, + "step": 5704 + }, + { + "epoch": 1.5427257977285018, + "grad_norm": 3.0625, + "learning_rate": 0.016580894164744914, + "loss": 3.4402, + "mean_token_accuracy": 0.37637704610824585, + "num_tokens": 2917131321.0, + "step": 5705 + }, + { + "epoch": 1.5429962141698215, + "grad_norm": 4.5, + "learning_rate": 0.016579657286633215, + "loss": 3.0888, + "mean_token_accuracy": 0.4308176338672638, + "num_tokens": 2917655591.0, + "step": 5706 + }, + { + "epoch": 1.543266630611141, + "grad_norm": 6.78125, + "learning_rate": 0.016578420237321225, + "loss": 3.5069, + "mean_token_accuracy": 0.4231441915035248, + "num_tokens": 2918055901.0, + "step": 5707 + }, + { + "epoch": 1.5435370470524608, + "grad_norm": 2.390625, + "learning_rate": 0.016577183016846898, + "loss": 3.2706, + "mean_token_accuracy": 0.3615175187587738, + "num_tokens": 2918580051.0, + "step": 5708 + }, + { + "epoch": 1.5438074634937804, + "grad_norm": 3.59375, + "learning_rate": 0.016575945625248188, + "loss": 3.2763, + "mean_token_accuracy": 0.37365907430648804, + "num_tokens": 2919104235.0, + "step": 5709 + }, + { + "epoch": 1.5440778799351, + "grad_norm": 4.59375, + "learning_rate": 0.016574708062563077, + "loss": 3.2886, + "mean_token_accuracy": 0.3736264109611511, + "num_tokens": 2919628371.0, + "step": 5710 + }, + { + "epoch": 1.5443482963764197, + "grad_norm": 19.625, + "learning_rate": 0.016573470328829523, + "loss": 11.8401, + "mean_token_accuracy": 0.0398639440536499, + "num_tokens": 2920152639.0, + "step": 5711 + }, + { + "epoch": 1.5446187128177393, + "grad_norm": 8.9375, + "learning_rate": 0.016572232424085512, + "loss": 3.9372, + "mean_token_accuracy": 0.3500904440879822, + "num_tokens": 2920670776.0, + "step": 5712 + }, + { + "epoch": 1.544889129259059, + "grad_norm": 2.203125, + "learning_rate": 0.01657099434836902, + "loss": 3.4323, + "mean_token_accuracy": 0.3115849792957306, + "num_tokens": 2921194996.0, + "step": 5713 + }, + { + "epoch": 1.5451595457003786, + "grad_norm": 2.9375, + "learning_rate": 0.01656975610171804, + "loss": 3.6614, + "mean_token_accuracy": 0.3496614992618561, + "num_tokens": 2921719253.0, + "step": 5714 + }, + { + "epoch": 1.5454299621416983, + "grad_norm": 3.8125, + "learning_rate": 0.016568517684170562, + "loss": 3.5729, + "mean_token_accuracy": 0.3550707995891571, + "num_tokens": 2922243477.0, + "step": 5715 + }, + { + "epoch": 1.545700378583018, + "grad_norm": 2.71875, + "learning_rate": 0.016567279095764588, + "loss": 3.3815, + "mean_token_accuracy": 0.3720541000366211, + "num_tokens": 2922767673.0, + "step": 5716 + }, + { + "epoch": 1.5459707950243375, + "grad_norm": 2.859375, + "learning_rate": 0.016566040336538114, + "loss": 3.4679, + "mean_token_accuracy": 0.363263875246048, + "num_tokens": 2923263257.0, + "step": 5717 + }, + { + "epoch": 1.5462412114656572, + "grad_norm": 2.921875, + "learning_rate": 0.016564801406529158, + "loss": 3.3906, + "mean_token_accuracy": 0.3740221858024597, + "num_tokens": 2923787433.0, + "step": 5718 + }, + { + "epoch": 1.5465116279069768, + "grad_norm": 4.625, + "learning_rate": 0.01656356230577573, + "loss": 3.4722, + "mean_token_accuracy": 0.3604312539100647, + "num_tokens": 2924311532.0, + "step": 5719 + }, + { + "epoch": 1.5467820443482965, + "grad_norm": 2.765625, + "learning_rate": 0.01656232303431585, + "loss": 3.0251, + "mean_token_accuracy": 0.3892058730125427, + "num_tokens": 2924813204.0, + "step": 5720 + }, + { + "epoch": 1.5470524607896161, + "grad_norm": 2.765625, + "learning_rate": 0.01656108359218754, + "loss": 3.3488, + "mean_token_accuracy": 0.37048086524009705, + "num_tokens": 2925337485.0, + "step": 5721 + }, + { + "epoch": 1.5473228772309358, + "grad_norm": 3.421875, + "learning_rate": 0.01655984397942883, + "loss": 3.3265, + "mean_token_accuracy": 0.371364951133728, + "num_tokens": 2925861668.0, + "step": 5722 + }, + { + "epoch": 1.5475932936722554, + "grad_norm": 3.046875, + "learning_rate": 0.01655860419607776, + "loss": 3.3136, + "mean_token_accuracy": 0.3944058418273926, + "num_tokens": 2926385869.0, + "step": 5723 + }, + { + "epoch": 1.547863710113575, + "grad_norm": 5.90625, + "learning_rate": 0.01655736424217236, + "loss": 3.2051, + "mean_token_accuracy": 0.4252464175224304, + "num_tokens": 2926910030.0, + "step": 5724 + }, + { + "epoch": 1.5481341265548947, + "grad_norm": 1.9765625, + "learning_rate": 0.01655612411775069, + "loss": 3.1767, + "mean_token_accuracy": 0.3865434229373932, + "num_tokens": 2927434292.0, + "step": 5725 + }, + { + "epoch": 1.5484045429962143, + "grad_norm": 2.875, + "learning_rate": 0.016554883822850787, + "loss": 3.233, + "mean_token_accuracy": 0.380535751581192, + "num_tokens": 2927958524.0, + "step": 5726 + }, + { + "epoch": 1.5486749594375337, + "grad_norm": 2.5, + "learning_rate": 0.016553643357510715, + "loss": 3.3084, + "mean_token_accuracy": 0.38870659470558167, + "num_tokens": 2928482661.0, + "step": 5727 + }, + { + "epoch": 1.5489453758788534, + "grad_norm": 2.921875, + "learning_rate": 0.01655240272176853, + "loss": 3.2095, + "mean_token_accuracy": 0.38750120997428894, + "num_tokens": 2929006851.0, + "step": 5728 + }, + { + "epoch": 1.549215792320173, + "grad_norm": 2.765625, + "learning_rate": 0.0165511619156623, + "loss": 3.271, + "mean_token_accuracy": 0.407469779253006, + "num_tokens": 2929388690.0, + "step": 5729 + }, + { + "epoch": 1.5494862087614927, + "grad_norm": 3.0, + "learning_rate": 0.0165499209392301, + "loss": 3.5334, + "mean_token_accuracy": 0.3669964671134949, + "num_tokens": 2929912953.0, + "step": 5730 + }, + { + "epoch": 1.5497566252028123, + "grad_norm": 19.625, + "learning_rate": 0.016548679792510006, + "loss": 15.8678, + "mean_token_accuracy": 0.0, + "num_tokens": 2930394990.0, + "step": 5731 + }, + { + "epoch": 1.550027041644132, + "grad_norm": 6.0625, + "learning_rate": 0.016547438475540098, + "loss": 3.8084, + "mean_token_accuracy": 0.3149784207344055, + "num_tokens": 2930842136.0, + "step": 5732 + }, + { + "epoch": 1.5502974580854516, + "grad_norm": 2.59375, + "learning_rate": 0.01654619698835846, + "loss": 3.4993, + "mean_token_accuracy": 0.3578374981880188, + "num_tokens": 2931314400.0, + "step": 5733 + }, + { + "epoch": 1.5505678745267713, + "grad_norm": 3.453125, + "learning_rate": 0.01654495533100319, + "loss": 3.4395, + "mean_token_accuracy": 0.3891143202781677, + "num_tokens": 2931800635.0, + "step": 5734 + }, + { + "epoch": 1.550838290968091, + "grad_norm": 3.828125, + "learning_rate": 0.016543713503512386, + "loss": 3.4876, + "mean_token_accuracy": 0.3539775609970093, + "num_tokens": 2932324845.0, + "step": 5735 + }, + { + "epoch": 1.5511087074094105, + "grad_norm": 3.375, + "learning_rate": 0.016542471505924142, + "loss": 3.5692, + "mean_token_accuracy": 0.3717142343521118, + "num_tokens": 2932748707.0, + "step": 5736 + }, + { + "epoch": 1.5513791238507302, + "grad_norm": 2.96875, + "learning_rate": 0.01654122933827658, + "loss": 3.3835, + "mean_token_accuracy": 0.37732112407684326, + "num_tokens": 2933261156.0, + "step": 5737 + }, + { + "epoch": 1.5516495402920496, + "grad_norm": 2.390625, + "learning_rate": 0.016539987000607805, + "loss": 3.2762, + "mean_token_accuracy": 0.389365553855896, + "num_tokens": 2933726197.0, + "step": 5738 + }, + { + "epoch": 1.5519199567333692, + "grad_norm": 2.84375, + "learning_rate": 0.016538744492955934, + "loss": 3.4783, + "mean_token_accuracy": 0.36571186780929565, + "num_tokens": 2934250285.0, + "step": 5739 + }, + { + "epoch": 1.5521903731746889, + "grad_norm": 3.859375, + "learning_rate": 0.016537501815359096, + "loss": 3.5704, + "mean_token_accuracy": 0.36613133549690247, + "num_tokens": 2934774539.0, + "step": 5740 + }, + { + "epoch": 1.5524607896160085, + "grad_norm": 3.125, + "learning_rate": 0.01653625896785542, + "loss": 3.0027, + "mean_token_accuracy": 0.38330426812171936, + "num_tokens": 2935298788.0, + "step": 5741 + }, + { + "epoch": 1.5527312060573282, + "grad_norm": 2.765625, + "learning_rate": 0.016535015950483033, + "loss": 3.5366, + "mean_token_accuracy": 0.3691176176071167, + "num_tokens": 2935758730.0, + "step": 5742 + }, + { + "epoch": 1.5530016224986478, + "grad_norm": 3.0, + "learning_rate": 0.01653377276328008, + "loss": 3.282, + "mean_token_accuracy": 0.40269845724105835, + "num_tokens": 2936167501.0, + "step": 5743 + }, + { + "epoch": 1.5532720389399675, + "grad_norm": 2.359375, + "learning_rate": 0.016532529406284704, + "loss": 3.1727, + "mean_token_accuracy": 0.3894001543521881, + "num_tokens": 2936691564.0, + "step": 5744 + }, + { + "epoch": 1.553542455381287, + "grad_norm": 2.609375, + "learning_rate": 0.01653128587953506, + "loss": 3.0317, + "mean_token_accuracy": 0.400858610868454, + "num_tokens": 2937215760.0, + "step": 5745 + }, + { + "epoch": 1.5538128718226067, + "grad_norm": 3.015625, + "learning_rate": 0.016530042183069298, + "loss": 3.0746, + "mean_token_accuracy": 0.4139396548271179, + "num_tokens": 2937739876.0, + "step": 5746 + }, + { + "epoch": 1.5540832882639264, + "grad_norm": 2.984375, + "learning_rate": 0.016528798316925573, + "loss": 3.2987, + "mean_token_accuracy": 0.38776707649230957, + "num_tokens": 2938264016.0, + "step": 5747 + }, + { + "epoch": 1.554353704705246, + "grad_norm": 3.734375, + "learning_rate": 0.016527554281142064, + "loss": 3.3192, + "mean_token_accuracy": 0.35908201336860657, + "num_tokens": 2938788153.0, + "step": 5748 + }, + { + "epoch": 1.5546241211465657, + "grad_norm": 2.921875, + "learning_rate": 0.01652631007575693, + "loss": 3.208, + "mean_token_accuracy": 0.3860990107059479, + "num_tokens": 2939312385.0, + "step": 5749 + }, + { + "epoch": 1.5548945375878853, + "grad_norm": 3.109375, + "learning_rate": 0.016525065700808354, + "loss": 3.1879, + "mean_token_accuracy": 0.38841259479522705, + "num_tokens": 2939836589.0, + "step": 5750 + }, + { + "epoch": 1.555164954029205, + "grad_norm": 134.0, + "learning_rate": 0.01652382115633451, + "loss": 15.8767, + "mean_token_accuracy": 0.023218266665935516, + "num_tokens": 2940360832.0, + "step": 5751 + }, + { + "epoch": 1.5554353704705246, + "grad_norm": 4.03125, + "learning_rate": 0.01652257644237359, + "loss": 3.3299, + "mean_token_accuracy": 0.368024617433548, + "num_tokens": 2940885112.0, + "step": 5752 + }, + { + "epoch": 1.5557057869118442, + "grad_norm": 2.65625, + "learning_rate": 0.016521331558963785, + "loss": 3.4671, + "mean_token_accuracy": 0.36633050441741943, + "num_tokens": 2941409333.0, + "step": 5753 + }, + { + "epoch": 1.555976203353164, + "grad_norm": 3.296875, + "learning_rate": 0.016520086506143287, + "loss": 3.5677, + "mean_token_accuracy": 0.3450127840042114, + "num_tokens": 2941933470.0, + "step": 5754 + }, + { + "epoch": 1.5562466197944835, + "grad_norm": 3.15625, + "learning_rate": 0.016518841283950307, + "loss": 3.4332, + "mean_token_accuracy": 0.3657364249229431, + "num_tokens": 2942439882.0, + "step": 5755 + }, + { + "epoch": 1.5565170362358032, + "grad_norm": 3.21875, + "learning_rate": 0.01651759589242304, + "loss": 3.3182, + "mean_token_accuracy": 0.37533286213874817, + "num_tokens": 2942964143.0, + "step": 5756 + }, + { + "epoch": 1.5567874526771228, + "grad_norm": 2.515625, + "learning_rate": 0.016516350331599712, + "loss": 3.3907, + "mean_token_accuracy": 0.38167238235473633, + "num_tokens": 2943488422.0, + "step": 5757 + }, + { + "epoch": 1.5570578691184425, + "grad_norm": 3.390625, + "learning_rate": 0.01651510460151853, + "loss": 3.4806, + "mean_token_accuracy": 0.3566378355026245, + "num_tokens": 2944012684.0, + "step": 5758 + }, + { + "epoch": 1.557328285559762, + "grad_norm": 2.828125, + "learning_rate": 0.01651385870221772, + "loss": 3.2302, + "mean_token_accuracy": 0.39451077580451965, + "num_tokens": 2944536915.0, + "step": 5759 + }, + { + "epoch": 1.5575987020010817, + "grad_norm": 4.34375, + "learning_rate": 0.01651261263373551, + "loss": 3.3631, + "mean_token_accuracy": 0.36341559886932373, + "num_tokens": 2945061099.0, + "step": 5760 + }, + { + "epoch": 1.5578691184424014, + "grad_norm": 2.78125, + "learning_rate": 0.016511366396110134, + "loss": 3.1077, + "mean_token_accuracy": 0.3899628818035126, + "num_tokens": 2945567549.0, + "step": 5761 + }, + { + "epoch": 1.558139534883721, + "grad_norm": 3.578125, + "learning_rate": 0.016510119989379825, + "loss": 3.531, + "mean_token_accuracy": 0.3544907867908478, + "num_tokens": 2946091808.0, + "step": 5762 + }, + { + "epoch": 1.5584099513250407, + "grad_norm": 3.171875, + "learning_rate": 0.016508873413582835, + "loss": 3.35, + "mean_token_accuracy": 0.3831409215927124, + "num_tokens": 2946616088.0, + "step": 5763 + }, + { + "epoch": 1.5586803677663603, + "grad_norm": 2.78125, + "learning_rate": 0.016507626668757408, + "loss": 3.3801, + "mean_token_accuracy": 0.3880053162574768, + "num_tokens": 2947140339.0, + "step": 5764 + }, + { + "epoch": 1.55895078420768, + "grad_norm": 3.125, + "learning_rate": 0.016506379754941795, + "loss": 3.5409, + "mean_token_accuracy": 0.37361085414886475, + "num_tokens": 2947607885.0, + "step": 5765 + }, + { + "epoch": 1.5592212006489996, + "grad_norm": 3.21875, + "learning_rate": 0.01650513267217426, + "loss": 3.4296, + "mean_token_accuracy": 0.36749371886253357, + "num_tokens": 2948132095.0, + "step": 5766 + }, + { + "epoch": 1.5594916170903192, + "grad_norm": 3.84375, + "learning_rate": 0.016503885420493067, + "loss": 3.3899, + "mean_token_accuracy": 0.3382147550582886, + "num_tokens": 2948656246.0, + "step": 5767 + }, + { + "epoch": 1.5597620335316387, + "grad_norm": 3.015625, + "learning_rate": 0.016502637999936478, + "loss": 3.1969, + "mean_token_accuracy": 0.39218199253082275, + "num_tokens": 2949180480.0, + "step": 5768 + }, + { + "epoch": 1.5600324499729583, + "grad_norm": 4.1875, + "learning_rate": 0.016501390410542775, + "loss": 3.2993, + "mean_token_accuracy": 0.35344743728637695, + "num_tokens": 2949704666.0, + "step": 5769 + }, + { + "epoch": 1.560302866414278, + "grad_norm": 3.0, + "learning_rate": 0.016500142652350235, + "loss": 3.3216, + "mean_token_accuracy": 0.38000866770744324, + "num_tokens": 2950228810.0, + "step": 5770 + }, + { + "epoch": 1.5605732828555976, + "grad_norm": 37.0, + "learning_rate": 0.016498894725397145, + "loss": 10.6622, + "mean_token_accuracy": 0.0001297004200750962, + "num_tokens": 2950752965.0, + "step": 5771 + }, + { + "epoch": 1.5608436992969172, + "grad_norm": 7.59375, + "learning_rate": 0.016497646629721797, + "loss": 3.8404, + "mean_token_accuracy": 0.3116079568862915, + "num_tokens": 2951221590.0, + "step": 5772 + }, + { + "epoch": 1.5611141157382369, + "grad_norm": 2.484375, + "learning_rate": 0.01649639836536248, + "loss": 3.5504, + "mean_token_accuracy": 0.3671860098838806, + "num_tokens": 2951664328.0, + "step": 5773 + }, + { + "epoch": 1.5613845321795565, + "grad_norm": 3.484375, + "learning_rate": 0.016495149932357497, + "loss": 3.5746, + "mean_token_accuracy": 0.36847564578056335, + "num_tokens": 2952134065.0, + "step": 5774 + }, + { + "epoch": 1.5616549486208762, + "grad_norm": 4.78125, + "learning_rate": 0.016493901330745152, + "loss": 3.5945, + "mean_token_accuracy": 0.36503878235816956, + "num_tokens": 2952620189.0, + "step": 5775 + }, + { + "epoch": 1.5619253650621958, + "grad_norm": 2.875, + "learning_rate": 0.016492652560563755, + "loss": 3.1858, + "mean_token_accuracy": 0.3787197768688202, + "num_tokens": 2953144397.0, + "step": 5776 + }, + { + "epoch": 1.5621957815035155, + "grad_norm": 2.015625, + "learning_rate": 0.016491403621851634, + "loss": 3.4273, + "mean_token_accuracy": 0.3662930428981781, + "num_tokens": 2953668567.0, + "step": 5777 + }, + { + "epoch": 1.562466197944835, + "grad_norm": 2.71875, + "learning_rate": 0.016490154514647092, + "loss": 3.2045, + "mean_token_accuracy": 0.3896341323852539, + "num_tokens": 2954192736.0, + "step": 5778 + }, + { + "epoch": 1.5627366143861545, + "grad_norm": 3.015625, + "learning_rate": 0.016488905238988473, + "loss": 3.1784, + "mean_token_accuracy": 0.3701307773590088, + "num_tokens": 2954716960.0, + "step": 5779 + }, + { + "epoch": 1.5630070308274742, + "grad_norm": 2.890625, + "learning_rate": 0.01648765579491409, + "loss": 3.3948, + "mean_token_accuracy": 0.3766332268714905, + "num_tokens": 2955241118.0, + "step": 5780 + }, + { + "epoch": 1.5632774472687938, + "grad_norm": 2.53125, + "learning_rate": 0.016486406182462294, + "loss": 3.2952, + "mean_token_accuracy": 0.3735550045967102, + "num_tokens": 2955758607.0, + "step": 5781 + }, + { + "epoch": 1.5635478637101135, + "grad_norm": 1.8515625, + "learning_rate": 0.01648515640167142, + "loss": 3.1563, + "mean_token_accuracy": 0.3796390891075134, + "num_tokens": 2956282855.0, + "step": 5782 + }, + { + "epoch": 1.563818280151433, + "grad_norm": 2.015625, + "learning_rate": 0.01648390645257982, + "loss": 3.1308, + "mean_token_accuracy": 0.39907705783843994, + "num_tokens": 2956807120.0, + "step": 5783 + }, + { + "epoch": 1.5640886965927527, + "grad_norm": 2.640625, + "learning_rate": 0.01648265633522584, + "loss": 3.4172, + "mean_token_accuracy": 0.38344326615333557, + "num_tokens": 2957331221.0, + "step": 5784 + }, + { + "epoch": 1.5643591130340724, + "grad_norm": 2.609375, + "learning_rate": 0.016481406049647842, + "loss": 3.2019, + "mean_token_accuracy": 0.3776198625564575, + "num_tokens": 2957855430.0, + "step": 5785 + }, + { + "epoch": 1.564629529475392, + "grad_norm": 2.890625, + "learning_rate": 0.016480155595884186, + "loss": 3.2138, + "mean_token_accuracy": 0.3822671175003052, + "num_tokens": 2958379693.0, + "step": 5786 + }, + { + "epoch": 1.5648999459167117, + "grad_norm": 3.171875, + "learning_rate": 0.016478904973973238, + "loss": 3.2993, + "mean_token_accuracy": 0.38687312602996826, + "num_tokens": 2958903729.0, + "step": 5787 + }, + { + "epoch": 1.5651703623580313, + "grad_norm": 4.15625, + "learning_rate": 0.016477654183953375, + "loss": 3.3292, + "mean_token_accuracy": 0.38755446672439575, + "num_tokens": 2959415846.0, + "step": 5788 + }, + { + "epoch": 1.565440778799351, + "grad_norm": 2.75, + "learning_rate": 0.01647640322586297, + "loss": 3.0429, + "mean_token_accuracy": 0.4034304618835449, + "num_tokens": 2959849726.0, + "step": 5789 + }, + { + "epoch": 1.5657111952406706, + "grad_norm": 3.609375, + "learning_rate": 0.016475152099740417, + "loss": 3.3658, + "mean_token_accuracy": 0.3890445828437805, + "num_tokens": 2960373925.0, + "step": 5790 + }, + { + "epoch": 1.5659816116819902, + "grad_norm": 84.5, + "learning_rate": 0.016473900805624088, + "loss": 16.7392, + "mean_token_accuracy": 0.02155298925936222, + "num_tokens": 2960897970.0, + "step": 5791 + }, + { + "epoch": 1.5662520281233099, + "grad_norm": 8.6875, + "learning_rate": 0.016472649343552387, + "loss": 4.094, + "mean_token_accuracy": 0.2710106372833252, + "num_tokens": 2961392238.0, + "step": 5792 + }, + { + "epoch": 1.5665224445646295, + "grad_norm": 3.21875, + "learning_rate": 0.016471397713563714, + "loss": 3.6834, + "mean_token_accuracy": 0.3453250527381897, + "num_tokens": 2961916449.0, + "step": 5793 + }, + { + "epoch": 1.5667928610059492, + "grad_norm": 2.796875, + "learning_rate": 0.016470145915696465, + "loss": 3.3136, + "mean_token_accuracy": 0.36544597148895264, + "num_tokens": 2962440716.0, + "step": 5794 + }, + { + "epoch": 1.5670632774472688, + "grad_norm": 2.9375, + "learning_rate": 0.01646889394998905, + "loss": 3.4844, + "mean_token_accuracy": 0.3658849596977234, + "num_tokens": 2962945399.0, + "step": 5795 + }, + { + "epoch": 1.5673336938885885, + "grad_norm": 3.09375, + "learning_rate": 0.01646764181647989, + "loss": 3.1848, + "mean_token_accuracy": 0.38493236899375916, + "num_tokens": 2963469672.0, + "step": 5796 + }, + { + "epoch": 1.567604110329908, + "grad_norm": 3.34375, + "learning_rate": 0.0164663895152074, + "loss": 3.4347, + "mean_token_accuracy": 0.3830614686012268, + "num_tokens": 2963978348.0, + "step": 5797 + }, + { + "epoch": 1.5678745267712277, + "grad_norm": 3.28125, + "learning_rate": 0.01646513704621, + "loss": 3.2721, + "mean_token_accuracy": 0.37088102102279663, + "num_tokens": 2964502594.0, + "step": 5798 + }, + { + "epoch": 1.5681449432125474, + "grad_norm": 4.09375, + "learning_rate": 0.01646388440952613, + "loss": 3.2375, + "mean_token_accuracy": 0.3861519992351532, + "num_tokens": 2965026857.0, + "step": 5799 + }, + { + "epoch": 1.568415359653867, + "grad_norm": 3.015625, + "learning_rate": 0.016462631605194213, + "loss": 3.1128, + "mean_token_accuracy": 0.3903258144855499, + "num_tokens": 2965536460.0, + "step": 5800 + }, + { + "epoch": 1.5686857760951867, + "grad_norm": 2.859375, + "learning_rate": 0.016461378633252696, + "loss": 3.5183, + "mean_token_accuracy": 0.3591153621673584, + "num_tokens": 2966060720.0, + "step": 5801 + }, + { + "epoch": 1.5689561925365063, + "grad_norm": 2.953125, + "learning_rate": 0.01646012549374002, + "loss": 3.4847, + "mean_token_accuracy": 0.35999566316604614, + "num_tokens": 2966584978.0, + "step": 5802 + }, + { + "epoch": 1.569226608977826, + "grad_norm": 2.515625, + "learning_rate": 0.016458872186694635, + "loss": 3.2072, + "mean_token_accuracy": 0.41518300771713257, + "num_tokens": 2967101154.0, + "step": 5803 + }, + { + "epoch": 1.5694970254191456, + "grad_norm": 2.828125, + "learning_rate": 0.016457618712155002, + "loss": 3.255, + "mean_token_accuracy": 0.38554397225379944, + "num_tokens": 2967625432.0, + "step": 5804 + }, + { + "epoch": 1.5697674418604652, + "grad_norm": 3.484375, + "learning_rate": 0.016456365070159577, + "loss": 3.3718, + "mean_token_accuracy": 0.38091176748275757, + "num_tokens": 2968149659.0, + "step": 5805 + }, + { + "epoch": 1.5700378583017849, + "grad_norm": 2.515625, + "learning_rate": 0.01645511126074682, + "loss": 3.2776, + "mean_token_accuracy": 0.36128056049346924, + "num_tokens": 2968673790.0, + "step": 5806 + }, + { + "epoch": 1.5703082747431045, + "grad_norm": 2.390625, + "learning_rate": 0.016453857283955212, + "loss": 2.9877, + "mean_token_accuracy": 0.4054589569568634, + "num_tokens": 2969198032.0, + "step": 5807 + }, + { + "epoch": 1.5705786911844242, + "grad_norm": 2.09375, + "learning_rate": 0.016452603139823222, + "loss": 3.359, + "mean_token_accuracy": 0.4009987711906433, + "num_tokens": 2969640626.0, + "step": 5808 + }, + { + "epoch": 1.5708491076257436, + "grad_norm": 2.71875, + "learning_rate": 0.016451348828389334, + "loss": 3.1447, + "mean_token_accuracy": 0.3936309814453125, + "num_tokens": 2970164681.0, + "step": 5809 + }, + { + "epoch": 1.5711195240670632, + "grad_norm": 2.953125, + "learning_rate": 0.016450094349692034, + "loss": 3.3105, + "mean_token_accuracy": 0.38543015718460083, + "num_tokens": 2970688892.0, + "step": 5810 + }, + { + "epoch": 1.5713899405083829, + "grad_norm": 624.0, + "learning_rate": 0.016448839703769806, + "loss": 31.0332, + "mean_token_accuracy": 4.6622932131867856e-05, + "num_tokens": 2971213172.0, + "step": 5811 + }, + { + "epoch": 1.5716603569497025, + "grad_norm": 8.1875, + "learning_rate": 0.016447584890661157, + "loss": 3.7222, + "mean_token_accuracy": 0.3218311667442322, + "num_tokens": 2971737426.0, + "step": 5812 + }, + { + "epoch": 1.5719307733910222, + "grad_norm": 2.6875, + "learning_rate": 0.016446329910404584, + "loss": 3.5714, + "mean_token_accuracy": 0.3585274815559387, + "num_tokens": 2972252576.0, + "step": 5813 + }, + { + "epoch": 1.5722011898323418, + "grad_norm": 2.890625, + "learning_rate": 0.016445074763038588, + "loss": 3.2595, + "mean_token_accuracy": 0.38500046730041504, + "num_tokens": 2972776660.0, + "step": 5814 + }, + { + "epoch": 1.5724716062736614, + "grad_norm": 3.4375, + "learning_rate": 0.016443819448601685, + "loss": 3.5141, + "mean_token_accuracy": 0.36079642176628113, + "num_tokens": 2973300821.0, + "step": 5815 + }, + { + "epoch": 1.572742022714981, + "grad_norm": 3.0, + "learning_rate": 0.016442563967132395, + "loss": 3.3481, + "mean_token_accuracy": 0.3848302364349365, + "num_tokens": 2973825053.0, + "step": 5816 + }, + { + "epoch": 1.5730124391563007, + "grad_norm": 2.5625, + "learning_rate": 0.01644130831866924, + "loss": 3.2532, + "mean_token_accuracy": 0.386901319026947, + "num_tokens": 2974318030.0, + "step": 5817 + }, + { + "epoch": 1.5732828555976204, + "grad_norm": 3.03125, + "learning_rate": 0.016440052503250736, + "loss": 3.421, + "mean_token_accuracy": 0.3938066363334656, + "num_tokens": 2974842217.0, + "step": 5818 + }, + { + "epoch": 1.57355327203894, + "grad_norm": 3.8125, + "learning_rate": 0.01643879652091543, + "loss": 3.3371, + "mean_token_accuracy": 0.31540125608444214, + "num_tokens": 2975366490.0, + "step": 5819 + }, + { + "epoch": 1.5738236884802594, + "grad_norm": 2.28125, + "learning_rate": 0.01643754037170185, + "loss": 3.2972, + "mean_token_accuracy": 0.3823733329772949, + "num_tokens": 2975890621.0, + "step": 5820 + }, + { + "epoch": 1.574094104921579, + "grad_norm": 3.203125, + "learning_rate": 0.01643628405564854, + "loss": 3.3929, + "mean_token_accuracy": 0.3785489797592163, + "num_tokens": 2976407172.0, + "step": 5821 + }, + { + "epoch": 1.5743645213628987, + "grad_norm": 2.875, + "learning_rate": 0.01643502757279405, + "loss": 3.367, + "mean_token_accuracy": 0.36981451511383057, + "num_tokens": 2976873140.0, + "step": 5822 + }, + { + "epoch": 1.5746349378042184, + "grad_norm": 3.671875, + "learning_rate": 0.01643377092317693, + "loss": 3.2289, + "mean_token_accuracy": 0.3668769299983978, + "num_tokens": 2977397289.0, + "step": 5823 + }, + { + "epoch": 1.574905354245538, + "grad_norm": 2.734375, + "learning_rate": 0.016432514106835737, + "loss": 3.1247, + "mean_token_accuracy": 0.39086979627609253, + "num_tokens": 2977921466.0, + "step": 5824 + }, + { + "epoch": 1.5751757706868577, + "grad_norm": 2.78125, + "learning_rate": 0.016431257123809037, + "loss": 3.1811, + "mean_token_accuracy": 0.376113623380661, + "num_tokens": 2978445679.0, + "step": 5825 + }, + { + "epoch": 1.5754461871281773, + "grad_norm": 3.015625, + "learning_rate": 0.016429999974135395, + "loss": 3.2945, + "mean_token_accuracy": 0.4001314640045166, + "num_tokens": 2978969921.0, + "step": 5826 + }, + { + "epoch": 1.575716603569497, + "grad_norm": 2.234375, + "learning_rate": 0.016428742657853383, + "loss": 2.9795, + "mean_token_accuracy": 0.39466503262519836, + "num_tokens": 2979494003.0, + "step": 5827 + }, + { + "epoch": 1.5759870200108166, + "grad_norm": 2.8125, + "learning_rate": 0.016427485175001585, + "loss": 3.0578, + "mean_token_accuracy": 0.3892800807952881, + "num_tokens": 2979986649.0, + "step": 5828 + }, + { + "epoch": 1.5762574364521362, + "grad_norm": 2.328125, + "learning_rate": 0.01642622752561858, + "loss": 3.3295, + "mean_token_accuracy": 0.377101868391037, + "num_tokens": 2980510921.0, + "step": 5829 + }, + { + "epoch": 1.5765278528934559, + "grad_norm": 2.390625, + "learning_rate": 0.016424969709742956, + "loss": 3.2064, + "mean_token_accuracy": 0.3827810287475586, + "num_tokens": 2981035175.0, + "step": 5830 + }, + { + "epoch": 1.5767982693347755, + "grad_norm": 322.0, + "learning_rate": 0.016423711727413307, + "loss": 28.3453, + "mean_token_accuracy": 0.0003894803812727332, + "num_tokens": 2981559354.0, + "step": 5831 + }, + { + "epoch": 1.5770686857760952, + "grad_norm": 7.8125, + "learning_rate": 0.016422453578668235, + "loss": 3.75, + "mean_token_accuracy": 0.34986963868141174, + "num_tokens": 2982083590.0, + "step": 5832 + }, + { + "epoch": 1.5773391022174148, + "grad_norm": 2.515625, + "learning_rate": 0.016421195263546337, + "loss": 3.2338, + "mean_token_accuracy": 0.39865753054618835, + "num_tokens": 2982607775.0, + "step": 5833 + }, + { + "epoch": 1.5776095186587344, + "grad_norm": 2.75, + "learning_rate": 0.01641993678208623, + "loss": 3.439, + "mean_token_accuracy": 0.3498760759830475, + "num_tokens": 2983132035.0, + "step": 5834 + }, + { + "epoch": 1.577879935100054, + "grad_norm": 2.515625, + "learning_rate": 0.016418678134326518, + "loss": 3.2197, + "mean_token_accuracy": 0.38512951135635376, + "num_tokens": 2983656209.0, + "step": 5835 + }, + { + "epoch": 1.5781503515413737, + "grad_norm": 2.953125, + "learning_rate": 0.01641741932030583, + "loss": 3.2673, + "mean_token_accuracy": 0.3794361650943756, + "num_tokens": 2984180238.0, + "step": 5836 + }, + { + "epoch": 1.5784207679826934, + "grad_norm": 2.953125, + "learning_rate": 0.016416160340062786, + "loss": 3.3479, + "mean_token_accuracy": 0.3839966058731079, + "num_tokens": 2984704476.0, + "step": 5837 + }, + { + "epoch": 1.578691184424013, + "grad_norm": 3.140625, + "learning_rate": 0.016414901193636013, + "loss": 3.4299, + "mean_token_accuracy": 0.37938979268074036, + "num_tokens": 2985228663.0, + "step": 5838 + }, + { + "epoch": 1.5789616008653327, + "grad_norm": 3.171875, + "learning_rate": 0.01641364188106415, + "loss": 3.2862, + "mean_token_accuracy": 0.38324522972106934, + "num_tokens": 2985699199.0, + "step": 5839 + }, + { + "epoch": 1.5792320173066523, + "grad_norm": 3.234375, + "learning_rate": 0.016412382402385826, + "loss": 3.4466, + "mean_token_accuracy": 0.37623661756515503, + "num_tokens": 2986223295.0, + "step": 5840 + }, + { + "epoch": 1.579502433747972, + "grad_norm": 2.75, + "learning_rate": 0.016411122757639698, + "loss": 3.0503, + "mean_token_accuracy": 0.39515674114227295, + "num_tokens": 2986747535.0, + "step": 5841 + }, + { + "epoch": 1.5797728501892916, + "grad_norm": 2.5, + "learning_rate": 0.01640986294686441, + "loss": 3.3954, + "mean_token_accuracy": 0.3410092890262604, + "num_tokens": 2987271810.0, + "step": 5842 + }, + { + "epoch": 1.5800432666306112, + "grad_norm": 2.96875, + "learning_rate": 0.01640860297009862, + "loss": 3.2874, + "mean_token_accuracy": 0.37765395641326904, + "num_tokens": 2987757164.0, + "step": 5843 + }, + { + "epoch": 1.5803136830719309, + "grad_norm": 2.34375, + "learning_rate": 0.01640734282738098, + "loss": 2.9712, + "mean_token_accuracy": 0.4361361861228943, + "num_tokens": 2988281440.0, + "step": 5844 + }, + { + "epoch": 1.5805840995132505, + "grad_norm": 2.34375, + "learning_rate": 0.016406082518750163, + "loss": 3.3496, + "mean_token_accuracy": 0.3855191469192505, + "num_tokens": 2988805710.0, + "step": 5845 + }, + { + "epoch": 1.5808545159545702, + "grad_norm": 5.65625, + "learning_rate": 0.01640482204424484, + "loss": 2.9098, + "mean_token_accuracy": 0.41722601652145386, + "num_tokens": 2989299373.0, + "step": 5846 + }, + { + "epoch": 1.5811249323958898, + "grad_norm": 2.578125, + "learning_rate": 0.01640356140390368, + "loss": 3.3953, + "mean_token_accuracy": 0.3707040548324585, + "num_tokens": 2989781885.0, + "step": 5847 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 3.703125, + "learning_rate": 0.01640230059776536, + "loss": 3.1701, + "mean_token_accuracy": 0.41185262799263, + "num_tokens": 2990305966.0, + "step": 5848 + }, + { + "epoch": 1.581665765278529, + "grad_norm": 2.734375, + "learning_rate": 0.01640103962586858, + "loss": 3.3509, + "mean_token_accuracy": 0.3795434832572937, + "num_tokens": 2990830249.0, + "step": 5849 + }, + { + "epoch": 1.5819361817198487, + "grad_norm": 3.65625, + "learning_rate": 0.016399778488252012, + "loss": 3.269, + "mean_token_accuracy": 0.3799598217010498, + "num_tokens": 2991336020.0, + "step": 5850 + }, + { + "epoch": 1.5822065981611682, + "grad_norm": 96.0, + "learning_rate": 0.016398517184954362, + "loss": 13.0639, + "mean_token_accuracy": 0.005816641263663769, + "num_tokens": 2991805876.0, + "step": 5851 + }, + { + "epoch": 1.5824770146024878, + "grad_norm": 7.375, + "learning_rate": 0.01639725571601433, + "loss": 4.031, + "mean_token_accuracy": 0.29369693994522095, + "num_tokens": 2992330052.0, + "step": 5852 + }, + { + "epoch": 1.5827474310438074, + "grad_norm": 2.484375, + "learning_rate": 0.016395994081470622, + "loss": 3.4138, + "mean_token_accuracy": 0.36452823877334595, + "num_tokens": 2992854277.0, + "step": 5853 + }, + { + "epoch": 1.583017847485127, + "grad_norm": 3.09375, + "learning_rate": 0.016394732281361946, + "loss": 3.2492, + "mean_token_accuracy": 0.3633131980895996, + "num_tokens": 2993378542.0, + "step": 5854 + }, + { + "epoch": 1.5832882639264467, + "grad_norm": 3.0625, + "learning_rate": 0.016393470315727018, + "loss": 3.3093, + "mean_token_accuracy": 0.3795439600944519, + "num_tokens": 2993902773.0, + "step": 5855 + }, + { + "epoch": 1.5835586803677664, + "grad_norm": 2.953125, + "learning_rate": 0.01639220818460456, + "loss": 3.3629, + "mean_token_accuracy": 0.3858698010444641, + "num_tokens": 2994416587.0, + "step": 5856 + }, + { + "epoch": 1.583829096809086, + "grad_norm": 4.25, + "learning_rate": 0.0163909458880333, + "loss": 3.3968, + "mean_token_accuracy": 0.3731660842895508, + "num_tokens": 2994940732.0, + "step": 5857 + }, + { + "epoch": 1.5840995132504057, + "grad_norm": 3.21875, + "learning_rate": 0.016389683426051963, + "loss": 3.4483, + "mean_token_accuracy": 0.3940216302871704, + "num_tokens": 2995464931.0, + "step": 5858 + }, + { + "epoch": 1.5843699296917253, + "grad_norm": 3.15625, + "learning_rate": 0.016388420798699288, + "loss": 3.4184, + "mean_token_accuracy": 0.37592145800590515, + "num_tokens": 2995957271.0, + "step": 5859 + }, + { + "epoch": 1.584640346133045, + "grad_norm": 2.921875, + "learning_rate": 0.016387158006014015, + "loss": 3.215, + "mean_token_accuracy": 0.38187313079833984, + "num_tokens": 2996481446.0, + "step": 5860 + }, + { + "epoch": 1.5849107625743644, + "grad_norm": 2.328125, + "learning_rate": 0.016385895048034897, + "loss": 3.4058, + "mean_token_accuracy": 0.38346752524375916, + "num_tokens": 2997005581.0, + "step": 5861 + }, + { + "epoch": 1.585181179015684, + "grad_norm": 2.890625, + "learning_rate": 0.01638463192480068, + "loss": 3.1977, + "mean_token_accuracy": 0.3886939585208893, + "num_tokens": 2997529827.0, + "step": 5862 + }, + { + "epoch": 1.5854515954570036, + "grad_norm": 2.6875, + "learning_rate": 0.016383368636350114, + "loss": 3.2895, + "mean_token_accuracy": 0.3673856854438782, + "num_tokens": 2998054059.0, + "step": 5863 + }, + { + "epoch": 1.5857220118983233, + "grad_norm": 3.296875, + "learning_rate": 0.01638210518272197, + "loss": 3.4087, + "mean_token_accuracy": 0.36103686690330505, + "num_tokens": 2998578310.0, + "step": 5864 + }, + { + "epoch": 1.585992428339643, + "grad_norm": 2.875, + "learning_rate": 0.016380841563955015, + "loss": 3.4616, + "mean_token_accuracy": 0.38624370098114014, + "num_tokens": 2999102420.0, + "step": 5865 + }, + { + "epoch": 1.5862628447809626, + "grad_norm": 3.609375, + "learning_rate": 0.016379577780088014, + "loss": 3.4722, + "mean_token_accuracy": 0.35759174823760986, + "num_tokens": 2999626564.0, + "step": 5866 + }, + { + "epoch": 1.5865332612222822, + "grad_norm": 2.421875, + "learning_rate": 0.016378313831159747, + "loss": 3.2436, + "mean_token_accuracy": 0.4007090926170349, + "num_tokens": 3000150800.0, + "step": 5867 + }, + { + "epoch": 1.5868036776636019, + "grad_norm": 3.46875, + "learning_rate": 0.01637704971720899, + "loss": 3.1214, + "mean_token_accuracy": 0.3848063349723816, + "num_tokens": 3000611949.0, + "step": 5868 + }, + { + "epoch": 1.5870740941049215, + "grad_norm": 2.640625, + "learning_rate": 0.016375785438274542, + "loss": 3.2745, + "mean_token_accuracy": 0.387146532535553, + "num_tokens": 3001100393.0, + "step": 5869 + }, + { + "epoch": 1.5873445105462411, + "grad_norm": 2.796875, + "learning_rate": 0.016374520994395184, + "loss": 3.3594, + "mean_token_accuracy": 0.40113699436187744, + "num_tokens": 3001604367.0, + "step": 5870 + }, + { + "epoch": 1.5876149269875608, + "grad_norm": 4.4375, + "learning_rate": 0.016373256385609717, + "loss": 9.8261, + "mean_token_accuracy": 0.006080190651118755, + "num_tokens": 3002085952.0, + "step": 5871 + }, + { + "epoch": 1.5878853434288804, + "grad_norm": 6.53125, + "learning_rate": 0.016371991611956946, + "loss": 3.9007, + "mean_token_accuracy": 0.2910096347332001, + "num_tokens": 3002610212.0, + "step": 5872 + }, + { + "epoch": 1.5881557598702, + "grad_norm": 2.296875, + "learning_rate": 0.01637072667347567, + "loss": 3.2643, + "mean_token_accuracy": 0.37759822607040405, + "num_tokens": 3003134332.0, + "step": 5873 + }, + { + "epoch": 1.5884261763115197, + "grad_norm": 3.09375, + "learning_rate": 0.016369461570204705, + "loss": 3.2888, + "mean_token_accuracy": 0.38505327701568604, + "num_tokens": 3003618471.0, + "step": 5874 + }, + { + "epoch": 1.5886965927528394, + "grad_norm": 3.375, + "learning_rate": 0.01636819630218287, + "loss": 3.4071, + "mean_token_accuracy": 0.3542296290397644, + "num_tokens": 3004142680.0, + "step": 5875 + }, + { + "epoch": 1.588967009194159, + "grad_norm": 2.78125, + "learning_rate": 0.016366930869448987, + "loss": 3.4054, + "mean_token_accuracy": 0.37298110127449036, + "num_tokens": 3004666880.0, + "step": 5876 + }, + { + "epoch": 1.5892374256354787, + "grad_norm": 2.875, + "learning_rate": 0.01636566527204188, + "loss": 3.3581, + "mean_token_accuracy": 0.3709879517555237, + "num_tokens": 3005191080.0, + "step": 5877 + }, + { + "epoch": 1.5895078420767983, + "grad_norm": 11.5, + "learning_rate": 0.016364399510000384, + "loss": 3.0508, + "mean_token_accuracy": 0.4331287145614624, + "num_tokens": 3005701812.0, + "step": 5878 + }, + { + "epoch": 1.589778258518118, + "grad_norm": 2.09375, + "learning_rate": 0.016363133583363335, + "loss": 3.3442, + "mean_token_accuracy": 0.39534780383110046, + "num_tokens": 3006164146.0, + "step": 5879 + }, + { + "epoch": 1.5900486749594376, + "grad_norm": 2.234375, + "learning_rate": 0.01636186749216957, + "loss": 3.487, + "mean_token_accuracy": 0.36860737204551697, + "num_tokens": 3006688298.0, + "step": 5880 + }, + { + "epoch": 1.5903190914007572, + "grad_norm": 3.703125, + "learning_rate": 0.01636060123645795, + "loss": 3.3558, + "mean_token_accuracy": 0.3803441524505615, + "num_tokens": 3007210263.0, + "step": 5881 + }, + { + "epoch": 1.5905895078420769, + "grad_norm": 2.484375, + "learning_rate": 0.01635933481626732, + "loss": 3.3585, + "mean_token_accuracy": 0.3715883493423462, + "num_tokens": 3007734436.0, + "step": 5882 + }, + { + "epoch": 1.5908599242833965, + "grad_norm": 3.46875, + "learning_rate": 0.016358068231636533, + "loss": 3.3814, + "mean_token_accuracy": 0.37539875507354736, + "num_tokens": 3008258526.0, + "step": 5883 + }, + { + "epoch": 1.5911303407247162, + "grad_norm": 3.15625, + "learning_rate": 0.016356801482604454, + "loss": 3.2719, + "mean_token_accuracy": 0.3968465328216553, + "num_tokens": 3008782802.0, + "step": 5884 + }, + { + "epoch": 1.5914007571660358, + "grad_norm": 3.6875, + "learning_rate": 0.016355534569209955, + "loss": 3.1657, + "mean_token_accuracy": 0.3686543107032776, + "num_tokens": 3009306974.0, + "step": 5885 + }, + { + "epoch": 1.5916711736073554, + "grad_norm": 2.625, + "learning_rate": 0.01635426749149191, + "loss": 3.3649, + "mean_token_accuracy": 0.3695502281188965, + "num_tokens": 3009831146.0, + "step": 5886 + }, + { + "epoch": 1.591941590048675, + "grad_norm": 3.71875, + "learning_rate": 0.016353000249489186, + "loss": 3.4004, + "mean_token_accuracy": 0.3718014359474182, + "num_tokens": 3010315718.0, + "step": 5887 + }, + { + "epoch": 1.5922120064899947, + "grad_norm": 3.3125, + "learning_rate": 0.016351732843240676, + "loss": 3.3967, + "mean_token_accuracy": 0.3732876181602478, + "num_tokens": 3010839914.0, + "step": 5888 + }, + { + "epoch": 1.5924824229313144, + "grad_norm": 2.890625, + "learning_rate": 0.01635046527278526, + "loss": 3.2576, + "mean_token_accuracy": 0.3741346001625061, + "num_tokens": 3011322944.0, + "step": 5889 + }, + { + "epoch": 1.592752839372634, + "grad_norm": 2.5, + "learning_rate": 0.016349197538161837, + "loss": 3.1638, + "mean_token_accuracy": 0.4041042625904083, + "num_tokens": 3011832157.0, + "step": 5890 + }, + { + "epoch": 1.5930232558139537, + "grad_norm": 30.5, + "learning_rate": 0.016347929639409304, + "loss": 12.6974, + "mean_token_accuracy": 0.00936807505786419, + "num_tokens": 3012336991.0, + "step": 5891 + }, + { + "epoch": 1.593293672255273, + "grad_norm": 9.5, + "learning_rate": 0.01634666157656656, + "loss": 3.7599, + "mean_token_accuracy": 0.3388359248638153, + "num_tokens": 3012859024.0, + "step": 5892 + }, + { + "epoch": 1.5935640886965927, + "grad_norm": 3.375, + "learning_rate": 0.016345393349672516, + "loss": 3.3315, + "mean_token_accuracy": 0.3693690299987793, + "num_tokens": 3013383282.0, + "step": 5893 + }, + { + "epoch": 1.5938345051379124, + "grad_norm": 3.078125, + "learning_rate": 0.016344124958766085, + "loss": 3.3706, + "mean_token_accuracy": 0.3828818202018738, + "num_tokens": 3013907440.0, + "step": 5894 + }, + { + "epoch": 1.594104921579232, + "grad_norm": 3.21875, + "learning_rate": 0.016342856403886184, + "loss": 3.3653, + "mean_token_accuracy": 0.37830162048339844, + "num_tokens": 3014396727.0, + "step": 5895 + }, + { + "epoch": 1.5943753380205516, + "grad_norm": 2.875, + "learning_rate": 0.016341587685071733, + "loss": 3.4368, + "mean_token_accuracy": 0.36635446548461914, + "num_tokens": 3014920906.0, + "step": 5896 + }, + { + "epoch": 1.5946457544618713, + "grad_norm": 2.796875, + "learning_rate": 0.01634031880236167, + "loss": 3.2367, + "mean_token_accuracy": 0.3906739056110382, + "num_tokens": 3015445151.0, + "step": 5897 + }, + { + "epoch": 1.594916170903191, + "grad_norm": 2.859375, + "learning_rate": 0.016339049755794918, + "loss": 3.2048, + "mean_token_accuracy": 0.3888097405433655, + "num_tokens": 3015969353.0, + "step": 5898 + }, + { + "epoch": 1.5951865873445106, + "grad_norm": 2.5, + "learning_rate": 0.016337780545410418, + "loss": 3.391, + "mean_token_accuracy": 0.3746054470539093, + "num_tokens": 3016493524.0, + "step": 5899 + }, + { + "epoch": 1.5954570037858302, + "grad_norm": 2.703125, + "learning_rate": 0.01633651117124711, + "loss": 3.44, + "mean_token_accuracy": 0.37844395637512207, + "num_tokens": 3017017799.0, + "step": 5900 + }, + { + "epoch": 1.5957274202271499, + "grad_norm": 3.40625, + "learning_rate": 0.01633524163334395, + "loss": 3.1875, + "mean_token_accuracy": 0.36683082580566406, + "num_tokens": 3017542072.0, + "step": 5901 + }, + { + "epoch": 1.5959978366684693, + "grad_norm": 3.625, + "learning_rate": 0.016333971931739886, + "loss": 3.2652, + "mean_token_accuracy": 0.3720252513885498, + "num_tokens": 3018066274.0, + "step": 5902 + }, + { + "epoch": 1.596268253109789, + "grad_norm": 2.734375, + "learning_rate": 0.016332702066473876, + "loss": 3.1772, + "mean_token_accuracy": 0.38816243410110474, + "num_tokens": 3018586228.0, + "step": 5903 + }, + { + "epoch": 1.5965386695511086, + "grad_norm": 2.859375, + "learning_rate": 0.016331432037584883, + "loss": 3.0965, + "mean_token_accuracy": 0.4055802524089813, + "num_tokens": 3019110485.0, + "step": 5904 + }, + { + "epoch": 1.5968090859924282, + "grad_norm": 2.140625, + "learning_rate": 0.016330161845111883, + "loss": 3.243, + "mean_token_accuracy": 0.37679508328437805, + "num_tokens": 3019634712.0, + "step": 5905 + }, + { + "epoch": 1.5970795024337479, + "grad_norm": 2.5625, + "learning_rate": 0.016328891489093837, + "loss": 3.1469, + "mean_token_accuracy": 0.37789103388786316, + "num_tokens": 3020158982.0, + "step": 5906 + }, + { + "epoch": 1.5973499188750675, + "grad_norm": 3.171875, + "learning_rate": 0.01632762096956973, + "loss": 3.3351, + "mean_token_accuracy": 0.39013394713401794, + "num_tokens": 3020653980.0, + "step": 5907 + }, + { + "epoch": 1.5976203353163871, + "grad_norm": 3.890625, + "learning_rate": 0.016326350286578546, + "loss": 3.5276, + "mean_token_accuracy": 0.38041287660598755, + "num_tokens": 3021134890.0, + "step": 5908 + }, + { + "epoch": 1.5978907517577068, + "grad_norm": 2.453125, + "learning_rate": 0.016325079440159266, + "loss": 3.1807, + "mean_token_accuracy": 0.3851916491985321, + "num_tokens": 3021659115.0, + "step": 5909 + }, + { + "epoch": 1.5981611681990264, + "grad_norm": 2.796875, + "learning_rate": 0.01632380843035089, + "loss": 3.1489, + "mean_token_accuracy": 0.3974533677101135, + "num_tokens": 3022183376.0, + "step": 5910 + }, + { + "epoch": 1.598431584640346, + "grad_norm": 2.09375, + "learning_rate": 0.01632253725719242, + "loss": 10.3575, + "mean_token_accuracy": 2.9081263619445963e-06, + "num_tokens": 3022707620.0, + "step": 5911 + }, + { + "epoch": 1.5987020010816657, + "grad_norm": 7.6875, + "learning_rate": 0.01632126592072285, + "loss": 3.9095, + "mean_token_accuracy": 0.3114927411079407, + "num_tokens": 3023227396.0, + "step": 5912 + }, + { + "epoch": 1.5989724175229854, + "grad_norm": 2.3125, + "learning_rate": 0.016319994420981195, + "loss": 3.2865, + "mean_token_accuracy": 0.3672078847885132, + "num_tokens": 3023751489.0, + "step": 5913 + }, + { + "epoch": 1.599242833964305, + "grad_norm": 3.34375, + "learning_rate": 0.016318722758006464, + "loss": 3.4804, + "mean_token_accuracy": 0.3395078778266907, + "num_tokens": 3024275630.0, + "step": 5914 + }, + { + "epoch": 1.5995132504056246, + "grad_norm": 3.28125, + "learning_rate": 0.016317450931837677, + "loss": 3.0591, + "mean_token_accuracy": 0.39843446016311646, + "num_tokens": 3024799896.0, + "step": 5915 + }, + { + "epoch": 1.5997836668469443, + "grad_norm": 3.078125, + "learning_rate": 0.016316178942513863, + "loss": 3.0378, + "mean_token_accuracy": 0.384996622800827, + "num_tokens": 3025324128.0, + "step": 5916 + }, + { + "epoch": 1.600054083288264, + "grad_norm": 3.359375, + "learning_rate": 0.01631490679007404, + "loss": 3.2096, + "mean_token_accuracy": 0.3809989094734192, + "num_tokens": 3025799777.0, + "step": 5917 + }, + { + "epoch": 1.6003244997295836, + "grad_norm": 2.078125, + "learning_rate": 0.016313634474557248, + "loss": 3.3627, + "mean_token_accuracy": 0.3637571930885315, + "num_tokens": 3026324035.0, + "step": 5918 + }, + { + "epoch": 1.6005949161709032, + "grad_norm": 3.0625, + "learning_rate": 0.016312361996002524, + "loss": 3.4644, + "mean_token_accuracy": 0.36623111367225647, + "num_tokens": 3026848259.0, + "step": 5919 + }, + { + "epoch": 1.6008653326122229, + "grad_norm": 3.9375, + "learning_rate": 0.016311089354448912, + "loss": 3.4376, + "mean_token_accuracy": 0.38185346126556396, + "num_tokens": 3027372527.0, + "step": 5920 + }, + { + "epoch": 1.6011357490535425, + "grad_norm": 4.5625, + "learning_rate": 0.01630981654993546, + "loss": 3.4763, + "mean_token_accuracy": 0.36163681745529175, + "num_tokens": 3027839506.0, + "step": 5921 + }, + { + "epoch": 1.6014061654948621, + "grad_norm": 2.125, + "learning_rate": 0.01630854358250122, + "loss": 3.1387, + "mean_token_accuracy": 0.4246726632118225, + "num_tokens": 3028297979.0, + "step": 5922 + }, + { + "epoch": 1.6016765819361818, + "grad_norm": 2.96875, + "learning_rate": 0.016307270452185253, + "loss": 3.3014, + "mean_token_accuracy": 0.37746310234069824, + "num_tokens": 3028810475.0, + "step": 5923 + }, + { + "epoch": 1.6019469983775014, + "grad_norm": 2.46875, + "learning_rate": 0.016305997159026617, + "loss": 3.3329, + "mean_token_accuracy": 0.39211305975914, + "num_tokens": 3029301110.0, + "step": 5924 + }, + { + "epoch": 1.602217414818821, + "grad_norm": 2.875, + "learning_rate": 0.01630472370306439, + "loss": 3.2631, + "mean_token_accuracy": 0.37972837686538696, + "num_tokens": 3029825351.0, + "step": 5925 + }, + { + "epoch": 1.6024878312601407, + "grad_norm": 2.578125, + "learning_rate": 0.016303450084337635, + "loss": 3.3074, + "mean_token_accuracy": 0.3784705102443695, + "num_tokens": 3030302062.0, + "step": 5926 + }, + { + "epoch": 1.6027582477014604, + "grad_norm": 3.71875, + "learning_rate": 0.016302176302885437, + "loss": 3.3079, + "mean_token_accuracy": 0.39282649755477905, + "num_tokens": 3030826276.0, + "step": 5927 + }, + { + "epoch": 1.60302866414278, + "grad_norm": 3.015625, + "learning_rate": 0.016300902358746875, + "loss": 2.9486, + "mean_token_accuracy": 0.41652020812034607, + "num_tokens": 3031323985.0, + "step": 5928 + }, + { + "epoch": 1.6032990805840996, + "grad_norm": 2.8125, + "learning_rate": 0.016299628251961046, + "loss": 3.3567, + "mean_token_accuracy": 0.36805087327957153, + "num_tokens": 3031848157.0, + "step": 5929 + }, + { + "epoch": 1.6035694970254193, + "grad_norm": 3.265625, + "learning_rate": 0.016298353982567033, + "loss": 3.5724, + "mean_token_accuracy": 0.37079960107803345, + "num_tokens": 3032338699.0, + "step": 5930 + }, + { + "epoch": 1.603839913466739, + "grad_norm": 38.5, + "learning_rate": 0.016297079550603936, + "loss": 10.7485, + "mean_token_accuracy": 0.011473109945654869, + "num_tokens": 3032862914.0, + "step": 5931 + }, + { + "epoch": 1.6041103299080586, + "grad_norm": 4.75, + "learning_rate": 0.016295804956110865, + "loss": 3.5383, + "mean_token_accuracy": 0.33029472827911377, + "num_tokens": 3033387107.0, + "step": 5932 + }, + { + "epoch": 1.604380746349378, + "grad_norm": 3.078125, + "learning_rate": 0.016294530199126914, + "loss": 3.3984, + "mean_token_accuracy": 0.38061973452568054, + "num_tokens": 3033847697.0, + "step": 5933 + }, + { + "epoch": 1.6046511627906976, + "grad_norm": 2.796875, + "learning_rate": 0.016293255279691218, + "loss": 3.2142, + "mean_token_accuracy": 0.379577100276947, + "num_tokens": 3034371767.0, + "step": 5934 + }, + { + "epoch": 1.6049215792320173, + "grad_norm": 3.15625, + "learning_rate": 0.01629198019784288, + "loss": 3.3587, + "mean_token_accuracy": 0.35228431224823, + "num_tokens": 3034895879.0, + "step": 5935 + }, + { + "epoch": 1.605191995673337, + "grad_norm": 2.421875, + "learning_rate": 0.016290704953621025, + "loss": 3.3171, + "mean_token_accuracy": 0.3859173655509949, + "num_tokens": 3035420150.0, + "step": 5936 + }, + { + "epoch": 1.6054624121146566, + "grad_norm": 12.625, + "learning_rate": 0.016289429547064783, + "loss": 3.1182, + "mean_token_accuracy": 0.388115257024765, + "num_tokens": 3035944433.0, + "step": 5937 + }, + { + "epoch": 1.6057328285559762, + "grad_norm": 2.6875, + "learning_rate": 0.01628815397821329, + "loss": 3.5477, + "mean_token_accuracy": 0.3582558035850525, + "num_tokens": 3036468714.0, + "step": 5938 + }, + { + "epoch": 1.6060032449972959, + "grad_norm": 2.234375, + "learning_rate": 0.01628687824710568, + "loss": 3.4466, + "mean_token_accuracy": 0.3868185877799988, + "num_tokens": 3036945113.0, + "step": 5939 + }, + { + "epoch": 1.6062736614386155, + "grad_norm": 3.078125, + "learning_rate": 0.016285602353781098, + "loss": 3.1674, + "mean_token_accuracy": 0.3919040858745575, + "num_tokens": 3037427467.0, + "step": 5940 + }, + { + "epoch": 1.6065440778799351, + "grad_norm": 3.09375, + "learning_rate": 0.01628432629827869, + "loss": 3.1215, + "mean_token_accuracy": 0.39724916219711304, + "num_tokens": 3037951650.0, + "step": 5941 + }, + { + "epoch": 1.6068144943212548, + "grad_norm": 2.65625, + "learning_rate": 0.016283050080637615, + "loss": 3.2593, + "mean_token_accuracy": 0.3831745386123657, + "num_tokens": 3038475864.0, + "step": 5942 + }, + { + "epoch": 1.6070849107625742, + "grad_norm": 2.71875, + "learning_rate": 0.016281773700897027, + "loss": 3.3065, + "mean_token_accuracy": 0.3923853039741516, + "num_tokens": 3039000140.0, + "step": 5943 + }, + { + "epoch": 1.6073553272038938, + "grad_norm": 2.625, + "learning_rate": 0.016280497159096085, + "loss": 3.2577, + "mean_token_accuracy": 0.38224661350250244, + "num_tokens": 3039524292.0, + "step": 5944 + }, + { + "epoch": 1.6076257436452135, + "grad_norm": 2.90625, + "learning_rate": 0.016279220455273965, + "loss": 3.2339, + "mean_token_accuracy": 0.3800657093524933, + "num_tokens": 3040048467.0, + "step": 5945 + }, + { + "epoch": 1.6078961600865331, + "grad_norm": 3.203125, + "learning_rate": 0.016277943589469834, + "loss": 3.0389, + "mean_token_accuracy": 0.3972659707069397, + "num_tokens": 3040572593.0, + "step": 5946 + }, + { + "epoch": 1.6081665765278528, + "grad_norm": 2.890625, + "learning_rate": 0.016276666561722872, + "loss": 3.2107, + "mean_token_accuracy": 0.3973160982131958, + "num_tokens": 3041036397.0, + "step": 5947 + }, + { + "epoch": 1.6084369929691724, + "grad_norm": 8.1875, + "learning_rate": 0.016275389372072263, + "loss": 2.8928, + "mean_token_accuracy": 0.47113314270973206, + "num_tokens": 3041560614.0, + "step": 5948 + }, + { + "epoch": 1.608707409410492, + "grad_norm": 2.109375, + "learning_rate": 0.01627411202055719, + "loss": 3.2533, + "mean_token_accuracy": 0.3987295925617218, + "num_tokens": 3042084735.0, + "step": 5949 + }, + { + "epoch": 1.6089778258518117, + "grad_norm": 3.578125, + "learning_rate": 0.01627283450721686, + "loss": 3.3976, + "mean_token_accuracy": 0.374055951833725, + "num_tokens": 3042608926.0, + "step": 5950 + }, + { + "epoch": 1.6092482422931313, + "grad_norm": 22.125, + "learning_rate": 0.01627155683209045, + "loss": 11.5685, + "mean_token_accuracy": 0.00874580442905426, + "num_tokens": 3043072020.0, + "step": 5951 + }, + { + "epoch": 1.609518658734451, + "grad_norm": 8.6875, + "learning_rate": 0.016270278995217182, + "loss": 3.9784, + "mean_token_accuracy": 0.3462751507759094, + "num_tokens": 3043566010.0, + "step": 5952 + }, + { + "epoch": 1.6097890751757706, + "grad_norm": 3.0, + "learning_rate": 0.01626900099663625, + "loss": 3.2826, + "mean_token_accuracy": 0.33806300163269043, + "num_tokens": 3044090151.0, + "step": 5953 + }, + { + "epoch": 1.6100594916170903, + "grad_norm": 2.671875, + "learning_rate": 0.01626772283638688, + "loss": 3.3563, + "mean_token_accuracy": 0.37209177017211914, + "num_tokens": 3044614369.0, + "step": 5954 + }, + { + "epoch": 1.61032990805841, + "grad_norm": 2.890625, + "learning_rate": 0.01626644451450827, + "loss": 3.3582, + "mean_token_accuracy": 0.3865430951118469, + "num_tokens": 3045095740.0, + "step": 5955 + }, + { + "epoch": 1.6106003244997296, + "grad_norm": 2.734375, + "learning_rate": 0.016265166031039665, + "loss": 3.3197, + "mean_token_accuracy": 0.3667396605014801, + "num_tokens": 3045591360.0, + "step": 5956 + }, + { + "epoch": 1.6108707409410492, + "grad_norm": 3.1875, + "learning_rate": 0.01626388738602028, + "loss": 3.3776, + "mean_token_accuracy": 0.38765403628349304, + "num_tokens": 3046095796.0, + "step": 5957 + }, + { + "epoch": 1.6111411573823688, + "grad_norm": 3.015625, + "learning_rate": 0.016262608579489347, + "loss": 3.5087, + "mean_token_accuracy": 0.3610993027687073, + "num_tokens": 3046563175.0, + "step": 5958 + }, + { + "epoch": 1.6114115738236885, + "grad_norm": 3.453125, + "learning_rate": 0.01626132961148611, + "loss": 2.854, + "mean_token_accuracy": 0.4486141800880432, + "num_tokens": 3047039283.0, + "step": 5959 + }, + { + "epoch": 1.6116819902650081, + "grad_norm": 3.03125, + "learning_rate": 0.016260050482049804, + "loss": 3.1973, + "mean_token_accuracy": 0.3892345428466797, + "num_tokens": 3047563466.0, + "step": 5960 + }, + { + "epoch": 1.6119524067063278, + "grad_norm": 3.5, + "learning_rate": 0.01625877119121968, + "loss": 3.423, + "mean_token_accuracy": 0.4105888605117798, + "num_tokens": 3048025184.0, + "step": 5961 + }, + { + "epoch": 1.6122228231476474, + "grad_norm": 2.59375, + "learning_rate": 0.016257491739035, + "loss": 3.3656, + "mean_token_accuracy": 0.3764706254005432, + "num_tokens": 3048549357.0, + "step": 5962 + }, + { + "epoch": 1.612493239588967, + "grad_norm": 3.484375, + "learning_rate": 0.016256212125535, + "loss": 3.1247, + "mean_token_accuracy": 0.35796692967414856, + "num_tokens": 3049073503.0, + "step": 5963 + }, + { + "epoch": 1.6127636560302867, + "grad_norm": 2.984375, + "learning_rate": 0.016254932350758963, + "loss": 3.4148, + "mean_token_accuracy": 0.383728951215744, + "num_tokens": 3049578539.0, + "step": 5964 + }, + { + "epoch": 1.6130340724716064, + "grad_norm": 3.296875, + "learning_rate": 0.016253652414746144, + "loss": 3.4816, + "mean_token_accuracy": 0.3543585538864136, + "num_tokens": 3050102726.0, + "step": 5965 + }, + { + "epoch": 1.613304488912926, + "grad_norm": 2.4375, + "learning_rate": 0.016252372317535818, + "loss": 3.1051, + "mean_token_accuracy": 0.4091062545776367, + "num_tokens": 3050607080.0, + "step": 5966 + }, + { + "epoch": 1.6135749053542456, + "grad_norm": 2.90625, + "learning_rate": 0.016251092059167267, + "loss": 3.0692, + "mean_token_accuracy": 0.36455345153808594, + "num_tokens": 3051107627.0, + "step": 5967 + }, + { + "epoch": 1.6138453217955653, + "grad_norm": 2.515625, + "learning_rate": 0.016249811639679763, + "loss": 3.1604, + "mean_token_accuracy": 0.39371031522750854, + "num_tokens": 3051614847.0, + "step": 5968 + }, + { + "epoch": 1.614115738236885, + "grad_norm": 3.609375, + "learning_rate": 0.016248531059112607, + "loss": 3.3287, + "mean_token_accuracy": 0.3897234797477722, + "num_tokens": 3052138992.0, + "step": 5969 + }, + { + "epoch": 1.6143861546782046, + "grad_norm": 2.765625, + "learning_rate": 0.01624725031750508, + "loss": 3.2105, + "mean_token_accuracy": 0.40043577551841736, + "num_tokens": 3052613650.0, + "step": 5970 + }, + { + "epoch": 1.6146565711195242, + "grad_norm": 4.78125, + "learning_rate": 0.01624596941489648, + "loss": 10.9145, + "mean_token_accuracy": 1.5497756976401433e-05, + "num_tokens": 3053128356.0, + "step": 5971 + }, + { + "epoch": 1.6149269875608439, + "grad_norm": 6.125, + "learning_rate": 0.01624468835132611, + "loss": 3.8609, + "mean_token_accuracy": 0.34264370799064636, + "num_tokens": 3053652474.0, + "step": 5972 + }, + { + "epoch": 1.6151974040021635, + "grad_norm": 2.625, + "learning_rate": 0.01624340712683328, + "loss": 3.3528, + "mean_token_accuracy": 0.33720093965530396, + "num_tokens": 3054176671.0, + "step": 5973 + }, + { + "epoch": 1.615467820443483, + "grad_norm": 2.65625, + "learning_rate": 0.0162421257414573, + "loss": 3.2407, + "mean_token_accuracy": 0.36777234077453613, + "num_tokens": 3054700819.0, + "step": 5974 + }, + { + "epoch": 1.6157382368848026, + "grad_norm": 3.15625, + "learning_rate": 0.016240844195237487, + "loss": 3.3322, + "mean_token_accuracy": 0.38555508852005005, + "num_tokens": 3055168923.0, + "step": 5975 + }, + { + "epoch": 1.6160086533261222, + "grad_norm": 2.359375, + "learning_rate": 0.01623956248821316, + "loss": 3.2882, + "mean_token_accuracy": 0.38969388604164124, + "num_tokens": 3055632997.0, + "step": 5976 + }, + { + "epoch": 1.6162790697674418, + "grad_norm": 2.65625, + "learning_rate": 0.016238280620423647, + "loss": 3.0862, + "mean_token_accuracy": 0.4475981295108795, + "num_tokens": 3056092408.0, + "step": 5977 + }, + { + "epoch": 1.6165494862087615, + "grad_norm": 2.875, + "learning_rate": 0.016236998591908282, + "loss": 3.0438, + "mean_token_accuracy": 0.3890003263950348, + "num_tokens": 3056616585.0, + "step": 5978 + }, + { + "epoch": 1.6168199026500811, + "grad_norm": 2.46875, + "learning_rate": 0.016235716402706397, + "loss": 3.3599, + "mean_token_accuracy": 0.4106864631175995, + "num_tokens": 3057080872.0, + "step": 5979 + }, + { + "epoch": 1.6170903190914008, + "grad_norm": 2.734375, + "learning_rate": 0.016234434052857336, + "loss": 3.3401, + "mean_token_accuracy": 0.36468446254730225, + "num_tokens": 3057605145.0, + "step": 5980 + }, + { + "epoch": 1.6173607355327204, + "grad_norm": 3.28125, + "learning_rate": 0.016233151542400445, + "loss": 3.562, + "mean_token_accuracy": 0.35647037625312805, + "num_tokens": 3058129359.0, + "step": 5981 + }, + { + "epoch": 1.61763115197404, + "grad_norm": 2.625, + "learning_rate": 0.016231868871375076, + "loss": 3.3044, + "mean_token_accuracy": 0.3924390971660614, + "num_tokens": 3058630327.0, + "step": 5982 + }, + { + "epoch": 1.6179015684153597, + "grad_norm": 3.15625, + "learning_rate": 0.016230586039820583, + "loss": 3.4125, + "mean_token_accuracy": 0.35419589281082153, + "num_tokens": 3059154518.0, + "step": 5983 + }, + { + "epoch": 1.6181719848566791, + "grad_norm": 2.4375, + "learning_rate": 0.01622930304777633, + "loss": 3.1115, + "mean_token_accuracy": 0.3960345387458801, + "num_tokens": 3059628553.0, + "step": 5984 + }, + { + "epoch": 1.6184424012979988, + "grad_norm": 3.09375, + "learning_rate": 0.01622801989528168, + "loss": 3.3632, + "mean_token_accuracy": 0.39259329438209534, + "num_tokens": 3060100155.0, + "step": 5985 + }, + { + "epoch": 1.6187128177393184, + "grad_norm": 3.828125, + "learning_rate": 0.016226736582376, + "loss": 3.3783, + "mean_token_accuracy": 0.3685217499732971, + "num_tokens": 3060624410.0, + "step": 5986 + }, + { + "epoch": 1.618983234180638, + "grad_norm": 2.921875, + "learning_rate": 0.01622545310909868, + "loss": 3.2095, + "mean_token_accuracy": 0.39359724521636963, + "num_tokens": 3061148590.0, + "step": 5987 + }, + { + "epoch": 1.6192536506219577, + "grad_norm": 2.859375, + "learning_rate": 0.016224169475489082, + "loss": 3.2619, + "mean_token_accuracy": 0.37714099884033203, + "num_tokens": 3061672583.0, + "step": 5988 + }, + { + "epoch": 1.6195240670632773, + "grad_norm": 3.5625, + "learning_rate": 0.01622288568158661, + "loss": 3.2643, + "mean_token_accuracy": 0.39363276958465576, + "num_tokens": 3062196648.0, + "step": 5989 + }, + { + "epoch": 1.619794483504597, + "grad_norm": 4.875, + "learning_rate": 0.01622160172743064, + "loss": 3.3281, + "mean_token_accuracy": 0.3854222893714905, + "num_tokens": 3062697533.0, + "step": 5990 + }, + { + "epoch": 1.6200648999459166, + "grad_norm": 5.09375, + "learning_rate": 0.016220317613060576, + "loss": 9.9434, + "mean_token_accuracy": 0.009782025590538979, + "num_tokens": 3063221566.0, + "step": 5991 + }, + { + "epoch": 1.6203353163872363, + "grad_norm": 10.3125, + "learning_rate": 0.016219033338515813, + "loss": 3.8347, + "mean_token_accuracy": 0.2894371747970581, + "num_tokens": 3063745847.0, + "step": 5992 + }, + { + "epoch": 1.620605732828556, + "grad_norm": 2.453125, + "learning_rate": 0.016217748903835764, + "loss": 3.2795, + "mean_token_accuracy": 0.35933634638786316, + "num_tokens": 3064270025.0, + "step": 5993 + }, + { + "epoch": 1.6208761492698756, + "grad_norm": 3.0, + "learning_rate": 0.01621646430905983, + "loss": 3.2249, + "mean_token_accuracy": 0.3979024887084961, + "num_tokens": 3064772647.0, + "step": 5994 + }, + { + "epoch": 1.6211465657111952, + "grad_norm": 2.890625, + "learning_rate": 0.01621517955422743, + "loss": 3.34, + "mean_token_accuracy": 0.35974591970443726, + "num_tokens": 3065296882.0, + "step": 5995 + }, + { + "epoch": 1.6214169821525148, + "grad_norm": 3.171875, + "learning_rate": 0.016213894639377984, + "loss": 3.3396, + "mean_token_accuracy": 0.3784846365451813, + "num_tokens": 3065776617.0, + "step": 5996 + }, + { + "epoch": 1.6216873985938345, + "grad_norm": 3.203125, + "learning_rate": 0.01621260956455092, + "loss": 3.4064, + "mean_token_accuracy": 0.3633345365524292, + "num_tokens": 3066300876.0, + "step": 5997 + }, + { + "epoch": 1.6219578150351541, + "grad_norm": 2.96875, + "learning_rate": 0.016211324329785667, + "loss": 3.246, + "mean_token_accuracy": 0.416703462600708, + "num_tokens": 3066825075.0, + "step": 5998 + }, + { + "epoch": 1.6222282314764738, + "grad_norm": 2.453125, + "learning_rate": 0.016210038935121653, + "loss": 3.2593, + "mean_token_accuracy": 0.3826320767402649, + "num_tokens": 3067349282.0, + "step": 5999 + }, + { + "epoch": 1.6224986479177934, + "grad_norm": 3.515625, + "learning_rate": 0.016208753380598323, + "loss": 3.5093, + "mean_token_accuracy": 0.36698269844055176, + "num_tokens": 3067873548.0, + "step": 6000 + }, + { + "epoch": 1.622769064359113, + "grad_norm": 3.15625, + "learning_rate": 0.016207467666255124, + "loss": 3.497, + "mean_token_accuracy": 0.37296196818351746, + "num_tokens": 3068397671.0, + "step": 6001 + }, + { + "epoch": 1.6230394808004327, + "grad_norm": 4.46875, + "learning_rate": 0.016206181792131506, + "loss": 3.3184, + "mean_token_accuracy": 0.3804183006286621, + "num_tokens": 3068904060.0, + "step": 6002 + }, + { + "epoch": 1.6233098972417523, + "grad_norm": 2.84375, + "learning_rate": 0.016204895758266915, + "loss": 3.3921, + "mean_token_accuracy": 0.3740311861038208, + "num_tokens": 3069428294.0, + "step": 6003 + }, + { + "epoch": 1.623580313683072, + "grad_norm": 3.453125, + "learning_rate": 0.016203609564700815, + "loss": 3.1451, + "mean_token_accuracy": 0.37862929701805115, + "num_tokens": 3069952377.0, + "step": 6004 + }, + { + "epoch": 1.6238507301243916, + "grad_norm": 2.8125, + "learning_rate": 0.016202323211472674, + "loss": 3.0259, + "mean_token_accuracy": 0.39101243019104004, + "num_tokens": 3070476652.0, + "step": 6005 + }, + { + "epoch": 1.6241211465657113, + "grad_norm": 3.546875, + "learning_rate": 0.016201036698621957, + "loss": 3.2991, + "mean_token_accuracy": 0.33887016773223877, + "num_tokens": 3071000816.0, + "step": 6006 + }, + { + "epoch": 1.624391563007031, + "grad_norm": 3.34375, + "learning_rate": 0.01619975002618814, + "loss": 3.008, + "mean_token_accuracy": 0.397485613822937, + "num_tokens": 3071525089.0, + "step": 6007 + }, + { + "epoch": 1.6246619794483506, + "grad_norm": 2.265625, + "learning_rate": 0.0161984631942107, + "loss": 3.3158, + "mean_token_accuracy": 0.3797111213207245, + "num_tokens": 3072049272.0, + "step": 6008 + }, + { + "epoch": 1.6249323958896702, + "grad_norm": 2.625, + "learning_rate": 0.01619717620272912, + "loss": 3.0811, + "mean_token_accuracy": 0.3854040205478668, + "num_tokens": 3072573269.0, + "step": 6009 + }, + { + "epoch": 1.6252028123309898, + "grad_norm": 2.109375, + "learning_rate": 0.016195889051782892, + "loss": 3.2655, + "mean_token_accuracy": 0.39614516496658325, + "num_tokens": 3073097449.0, + "step": 6010 + }, + { + "epoch": 1.6254732287723095, + "grad_norm": 2.046875, + "learning_rate": 0.016194601741411505, + "loss": 10.9272, + "mean_token_accuracy": 7.708260454819538e-06, + "num_tokens": 3073621585.0, + "step": 6011 + }, + { + "epoch": 1.6257436452136291, + "grad_norm": 8.8125, + "learning_rate": 0.016193314271654463, + "loss": 4.1827, + "mean_token_accuracy": 0.29764237999916077, + "num_tokens": 3074145790.0, + "step": 6012 + }, + { + "epoch": 1.6260140616549488, + "grad_norm": 2.171875, + "learning_rate": 0.016192026642551267, + "loss": 3.3073, + "mean_token_accuracy": 0.39012840390205383, + "num_tokens": 3074669911.0, + "step": 6013 + }, + { + "epoch": 1.6262844780962684, + "grad_norm": 2.484375, + "learning_rate": 0.01619073885414142, + "loss": 3.3762, + "mean_token_accuracy": 0.38178446888923645, + "num_tokens": 3075194191.0, + "step": 6014 + }, + { + "epoch": 1.6265548945375878, + "grad_norm": 3.171875, + "learning_rate": 0.016189450906464446, + "loss": 3.3136, + "mean_token_accuracy": 0.3742026388645172, + "num_tokens": 3075718437.0, + "step": 6015 + }, + { + "epoch": 1.6268253109789075, + "grad_norm": 3.59375, + "learning_rate": 0.016188162799559855, + "loss": 3.2698, + "mean_token_accuracy": 0.372772753238678, + "num_tokens": 3076242691.0, + "step": 6016 + }, + { + "epoch": 1.6270957274202271, + "grad_norm": 3.15625, + "learning_rate": 0.01618687453346717, + "loss": 3.49, + "mean_token_accuracy": 0.35196077823638916, + "num_tokens": 3076766959.0, + "step": 6017 + }, + { + "epoch": 1.6273661438615468, + "grad_norm": 3.375, + "learning_rate": 0.016185586108225923, + "loss": 3.3851, + "mean_token_accuracy": 0.377138614654541, + "num_tokens": 3077291051.0, + "step": 6018 + }, + { + "epoch": 1.6276365603028664, + "grad_norm": 3.109375, + "learning_rate": 0.01618429752387564, + "loss": 3.2757, + "mean_token_accuracy": 0.3708699941635132, + "num_tokens": 3077815333.0, + "step": 6019 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 2.484375, + "learning_rate": 0.01618300878045587, + "loss": 3.1489, + "mean_token_accuracy": 0.3907981216907501, + "num_tokens": 3078339547.0, + "step": 6020 + }, + { + "epoch": 1.6281773931855057, + "grad_norm": 2.53125, + "learning_rate": 0.016181719878006144, + "loss": 3.4405, + "mean_token_accuracy": 0.35075539350509644, + "num_tokens": 3078863753.0, + "step": 6021 + }, + { + "epoch": 1.6284478096268253, + "grad_norm": 2.65625, + "learning_rate": 0.016180430816566015, + "loss": 3.2838, + "mean_token_accuracy": 0.397932767868042, + "num_tokens": 3079356405.0, + "step": 6022 + }, + { + "epoch": 1.628718226068145, + "grad_norm": 3.046875, + "learning_rate": 0.016179141596175034, + "loss": 3.1515, + "mean_token_accuracy": 0.38621431589126587, + "num_tokens": 3079880576.0, + "step": 6023 + }, + { + "epoch": 1.6289886425094646, + "grad_norm": 3.09375, + "learning_rate": 0.016177852216872755, + "loss": 3.1881, + "mean_token_accuracy": 0.39172664284706116, + "num_tokens": 3080404845.0, + "step": 6024 + }, + { + "epoch": 1.629259058950784, + "grad_norm": 27.875, + "learning_rate": 0.01617656267869875, + "loss": 3.3015, + "mean_token_accuracy": 0.38957488536834717, + "num_tokens": 3080929001.0, + "step": 6025 + }, + { + "epoch": 1.6295294753921037, + "grad_norm": 4.78125, + "learning_rate": 0.016175272981692572, + "loss": 3.532, + "mean_token_accuracy": 0.3735833764076233, + "num_tokens": 3081394101.0, + "step": 6026 + }, + { + "epoch": 1.6297998918334233, + "grad_norm": 2.0, + "learning_rate": 0.016173983125893802, + "loss": 3.0805, + "mean_token_accuracy": 0.3921218812465668, + "num_tokens": 3081918309.0, + "step": 6027 + }, + { + "epoch": 1.630070308274743, + "grad_norm": 2.671875, + "learning_rate": 0.01617269311134201, + "loss": 3.2626, + "mean_token_accuracy": 0.39504462480545044, + "num_tokens": 3082442590.0, + "step": 6028 + }, + { + "epoch": 1.6303407247160626, + "grad_norm": 3.109375, + "learning_rate": 0.016171402938076788, + "loss": 3.344, + "mean_token_accuracy": 0.3941320776939392, + "num_tokens": 3082950206.0, + "step": 6029 + }, + { + "epoch": 1.6306111411573823, + "grad_norm": 3.359375, + "learning_rate": 0.016170112606137713, + "loss": 3.4403, + "mean_token_accuracy": 0.37440362572669983, + "num_tokens": 3083474403.0, + "step": 6030 + }, + { + "epoch": 1.630881557598702, + "grad_norm": 14.5625, + "learning_rate": 0.016168822115564377, + "loss": 9.9669, + "mean_token_accuracy": 0.005522933788597584, + "num_tokens": 3083957346.0, + "step": 6031 + }, + { + "epoch": 1.6311519740400215, + "grad_norm": 9.75, + "learning_rate": 0.01616753146639638, + "loss": 3.8093, + "mean_token_accuracy": 0.2687603831291199, + "num_tokens": 3084481505.0, + "step": 6032 + }, + { + "epoch": 1.6314223904813412, + "grad_norm": 3.140625, + "learning_rate": 0.016166240658673323, + "loss": 3.3461, + "mean_token_accuracy": 0.3648940920829773, + "num_tokens": 3084969748.0, + "step": 6033 + }, + { + "epoch": 1.6316928069226608, + "grad_norm": 2.796875, + "learning_rate": 0.016164949692434805, + "loss": 3.5976, + "mean_token_accuracy": 0.37525439262390137, + "num_tokens": 3085452078.0, + "step": 6034 + }, + { + "epoch": 1.6319632233639805, + "grad_norm": 3.390625, + "learning_rate": 0.016163658567720444, + "loss": 3.4741, + "mean_token_accuracy": 0.364693284034729, + "num_tokens": 3085976284.0, + "step": 6035 + }, + { + "epoch": 1.6322336398053001, + "grad_norm": 2.609375, + "learning_rate": 0.01616236728456985, + "loss": 3.3677, + "mean_token_accuracy": 0.3903312087059021, + "num_tokens": 3086500561.0, + "step": 6036 + }, + { + "epoch": 1.6325040562466198, + "grad_norm": 3.5, + "learning_rate": 0.01616107584302265, + "loss": 3.3941, + "mean_token_accuracy": 0.37188658118247986, + "num_tokens": 3086989032.0, + "step": 6037 + }, + { + "epoch": 1.6327744726879394, + "grad_norm": 2.828125, + "learning_rate": 0.016159784243118463, + "loss": 3.4586, + "mean_token_accuracy": 0.3867136836051941, + "num_tokens": 3087513194.0, + "step": 6038 + }, + { + "epoch": 1.633044889129259, + "grad_norm": 2.890625, + "learning_rate": 0.01615849248489692, + "loss": 3.2642, + "mean_token_accuracy": 0.3712729513645172, + "num_tokens": 3088037369.0, + "step": 6039 + }, + { + "epoch": 1.6333153055705787, + "grad_norm": 2.5625, + "learning_rate": 0.016157200568397662, + "loss": 3.3044, + "mean_token_accuracy": 0.3932929039001465, + "num_tokens": 3088551363.0, + "step": 6040 + }, + { + "epoch": 1.6335857220118983, + "grad_norm": 3.03125, + "learning_rate": 0.016155908493660324, + "loss": 3.2076, + "mean_token_accuracy": 0.3866519331932068, + "num_tokens": 3089028596.0, + "step": 6041 + }, + { + "epoch": 1.633856138453218, + "grad_norm": 2.75, + "learning_rate": 0.016154616260724546, + "loss": 2.997, + "mean_token_accuracy": 0.41386428475379944, + "num_tokens": 3089523633.0, + "step": 6042 + }, + { + "epoch": 1.6341265548945376, + "grad_norm": 1.8671875, + "learning_rate": 0.016153323869629987, + "loss": 3.361, + "mean_token_accuracy": 0.3913392126560211, + "num_tokens": 3089972529.0, + "step": 6043 + }, + { + "epoch": 1.6343969713358573, + "grad_norm": 2.578125, + "learning_rate": 0.016152031320416295, + "loss": 3.5031, + "mean_token_accuracy": 0.40211260318756104, + "num_tokens": 3090464076.0, + "step": 6044 + }, + { + "epoch": 1.634667387777177, + "grad_norm": 3.359375, + "learning_rate": 0.01615073861312313, + "loss": 3.2553, + "mean_token_accuracy": 0.39466047286987305, + "num_tokens": 3090988286.0, + "step": 6045 + }, + { + "epoch": 1.6349378042184965, + "grad_norm": 3.5625, + "learning_rate": 0.016149445747790158, + "loss": 3.4121, + "mean_token_accuracy": 0.3848686218261719, + "num_tokens": 3091476816.0, + "step": 6046 + }, + { + "epoch": 1.6352082206598162, + "grad_norm": 2.9375, + "learning_rate": 0.01614815272445705, + "loss": 3.2091, + "mean_token_accuracy": 0.3838846683502197, + "num_tokens": 3091999405.0, + "step": 6047 + }, + { + "epoch": 1.6354786371011358, + "grad_norm": 2.578125, + "learning_rate": 0.01614685954316347, + "loss": 3.2147, + "mean_token_accuracy": 0.3951336145401001, + "num_tokens": 3092523529.0, + "step": 6048 + }, + { + "epoch": 1.6357490535424555, + "grad_norm": 3.1875, + "learning_rate": 0.016145566203949108, + "loss": 3.3004, + "mean_token_accuracy": 0.3642066717147827, + "num_tokens": 3092998662.0, + "step": 6049 + }, + { + "epoch": 1.6360194699837751, + "grad_norm": 3.828125, + "learning_rate": 0.016144272706853643, + "loss": 3.4176, + "mean_token_accuracy": 0.3936457633972168, + "num_tokens": 3093494472.0, + "step": 6050 + }, + { + "epoch": 1.6362898864250948, + "grad_norm": 4.9375, + "learning_rate": 0.01614297905191676, + "loss": 10.2885, + "mean_token_accuracy": 0.0002929999609477818, + "num_tokens": 3093988651.0, + "step": 6051 + }, + { + "epoch": 1.6365603028664144, + "grad_norm": 10.3125, + "learning_rate": 0.016141685239178156, + "loss": 3.7725, + "mean_token_accuracy": 0.3065275549888611, + "num_tokens": 3094491400.0, + "step": 6052 + }, + { + "epoch": 1.636830719307734, + "grad_norm": 1.90625, + "learning_rate": 0.01614039126867753, + "loss": 3.3905, + "mean_token_accuracy": 0.36007916927337646, + "num_tokens": 3095015678.0, + "step": 6053 + }, + { + "epoch": 1.6371011357490537, + "grad_norm": 2.328125, + "learning_rate": 0.01613909714045458, + "loss": 3.5072, + "mean_token_accuracy": 0.37453538179397583, + "num_tokens": 3095539896.0, + "step": 6054 + }, + { + "epoch": 1.6373715521903733, + "grad_norm": 3.71875, + "learning_rate": 0.016137802854549018, + "loss": 3.1301, + "mean_token_accuracy": 0.3766515851020813, + "num_tokens": 3096064088.0, + "step": 6055 + }, + { + "epoch": 1.6376419686316928, + "grad_norm": 3.1875, + "learning_rate": 0.016136508411000557, + "loss": 3.0548, + "mean_token_accuracy": 0.401218980550766, + "num_tokens": 3096588183.0, + "step": 6056 + }, + { + "epoch": 1.6379123850730124, + "grad_norm": 2.8125, + "learning_rate": 0.016135213809848913, + "loss": 3.3839, + "mean_token_accuracy": 0.3708169460296631, + "num_tokens": 3097112447.0, + "step": 6057 + }, + { + "epoch": 1.638182801514332, + "grad_norm": 3.265625, + "learning_rate": 0.016133919051133802, + "loss": 3.5194, + "mean_token_accuracy": 0.36923760175704956, + "num_tokens": 3097599554.0, + "step": 6058 + }, + { + "epoch": 1.6384532179556517, + "grad_norm": 2.6875, + "learning_rate": 0.016132624134894965, + "loss": 3.3477, + "mean_token_accuracy": 0.3811371922492981, + "num_tokens": 3098123803.0, + "step": 6059 + }, + { + "epoch": 1.6387236343969713, + "grad_norm": 3.828125, + "learning_rate": 0.01613132906117212, + "loss": 3.457, + "mean_token_accuracy": 0.38039109110832214, + "num_tokens": 3098617055.0, + "step": 6060 + }, + { + "epoch": 1.638994050838291, + "grad_norm": 2.71875, + "learning_rate": 0.016130033830005008, + "loss": 3.2953, + "mean_token_accuracy": 0.3896465301513672, + "num_tokens": 3099141307.0, + "step": 6061 + }, + { + "epoch": 1.6392644672796106, + "grad_norm": 3.03125, + "learning_rate": 0.016128738441433376, + "loss": 3.4536, + "mean_token_accuracy": 0.35571610927581787, + "num_tokens": 3099665527.0, + "step": 6062 + }, + { + "epoch": 1.6395348837209303, + "grad_norm": 2.296875, + "learning_rate": 0.016127442895496964, + "loss": 3.0976, + "mean_token_accuracy": 0.39010947942733765, + "num_tokens": 3100189789.0, + "step": 6063 + }, + { + "epoch": 1.63980530016225, + "grad_norm": 3.359375, + "learning_rate": 0.016126147192235527, + "loss": 3.1799, + "mean_token_accuracy": 0.3653668165206909, + "num_tokens": 3100714004.0, + "step": 6064 + }, + { + "epoch": 1.6400757166035695, + "grad_norm": 3.8125, + "learning_rate": 0.01612485133168881, + "loss": 3.253, + "mean_token_accuracy": 0.38449400663375854, + "num_tokens": 3101238277.0, + "step": 6065 + }, + { + "epoch": 1.640346133044889, + "grad_norm": 3.140625, + "learning_rate": 0.016123555313896594, + "loss": 3.305, + "mean_token_accuracy": 0.3730964958667755, + "num_tokens": 3101735998.0, + "step": 6066 + }, + { + "epoch": 1.6406165494862086, + "grad_norm": 2.671875, + "learning_rate": 0.01612225913889863, + "loss": 3.1014, + "mean_token_accuracy": 0.38734501600265503, + "num_tokens": 3102260268.0, + "step": 6067 + }, + { + "epoch": 1.6408869659275283, + "grad_norm": 2.796875, + "learning_rate": 0.01612096280673469, + "loss": 3.0961, + "mean_token_accuracy": 0.4064769148826599, + "num_tokens": 3102784406.0, + "step": 6068 + }, + { + "epoch": 1.641157382368848, + "grad_norm": 3.40625, + "learning_rate": 0.016119666317444555, + "loss": 3.3113, + "mean_token_accuracy": 0.3585522770881653, + "num_tokens": 3103308676.0, + "step": 6069 + }, + { + "epoch": 1.6414277988101675, + "grad_norm": 2.59375, + "learning_rate": 0.016118369671067997, + "loss": 2.9698, + "mean_token_accuracy": 0.39750367403030396, + "num_tokens": 3103832893.0, + "step": 6070 + }, + { + "epoch": 1.6416982152514872, + "grad_norm": 19.5, + "learning_rate": 0.01611707286764481, + "loss": 12.7236, + "mean_token_accuracy": 0.010615493170917034, + "num_tokens": 3104357119.0, + "step": 6071 + }, + { + "epoch": 1.6419686316928068, + "grad_norm": 7.5, + "learning_rate": 0.016115775907214774, + "loss": 3.7775, + "mean_token_accuracy": 0.34729328751564026, + "num_tokens": 3104804366.0, + "step": 6072 + }, + { + "epoch": 1.6422390481341265, + "grad_norm": 2.546875, + "learning_rate": 0.016114478789817692, + "loss": 3.233, + "mean_token_accuracy": 0.3619586229324341, + "num_tokens": 3105303257.0, + "step": 6073 + }, + { + "epoch": 1.642509464575446, + "grad_norm": 2.40625, + "learning_rate": 0.01611318151549336, + "loss": 3.4928, + "mean_token_accuracy": 0.36197400093078613, + "num_tokens": 3105827530.0, + "step": 6074 + }, + { + "epoch": 1.6427798810167658, + "grad_norm": 3.296875, + "learning_rate": 0.01611188408428158, + "loss": 3.2489, + "mean_token_accuracy": 0.3901931643486023, + "num_tokens": 3106283975.0, + "step": 6075 + }, + { + "epoch": 1.6430502974580854, + "grad_norm": 3.328125, + "learning_rate": 0.016110586496222167, + "loss": 3.2821, + "mean_token_accuracy": 0.3757399916648865, + "num_tokens": 3106808155.0, + "step": 6076 + }, + { + "epoch": 1.643320713899405, + "grad_norm": 3.15625, + "learning_rate": 0.016109288751354926, + "loss": 3.4145, + "mean_token_accuracy": 0.36628082394599915, + "num_tokens": 3107332361.0, + "step": 6077 + }, + { + "epoch": 1.6435911303407247, + "grad_norm": 2.84375, + "learning_rate": 0.016107990849719686, + "loss": 3.1276, + "mean_token_accuracy": 0.40220123529434204, + "num_tokens": 3107814724.0, + "step": 6078 + }, + { + "epoch": 1.6438615467820443, + "grad_norm": 3.296875, + "learning_rate": 0.01610669279135626, + "loss": 3.243, + "mean_token_accuracy": 0.3774360716342926, + "num_tokens": 3108338899.0, + "step": 6079 + }, + { + "epoch": 1.644131963223364, + "grad_norm": 2.890625, + "learning_rate": 0.016105394576304485, + "loss": 3.1866, + "mean_token_accuracy": 0.39077675342559814, + "num_tokens": 3108827801.0, + "step": 6080 + }, + { + "epoch": 1.6444023796646836, + "grad_norm": 2.5, + "learning_rate": 0.01610409620460419, + "loss": 2.9318, + "mean_token_accuracy": 0.41625845432281494, + "num_tokens": 3109289752.0, + "step": 6081 + }, + { + "epoch": 1.6446727961060033, + "grad_norm": 2.65625, + "learning_rate": 0.01610279767629521, + "loss": 3.3828, + "mean_token_accuracy": 0.3843304216861725, + "num_tokens": 3109813906.0, + "step": 6082 + }, + { + "epoch": 1.644943212547323, + "grad_norm": 3.6875, + "learning_rate": 0.016101498991417395, + "loss": 3.275, + "mean_token_accuracy": 0.3502517640590668, + "num_tokens": 3110338179.0, + "step": 6083 + }, + { + "epoch": 1.6452136289886425, + "grad_norm": 2.8125, + "learning_rate": 0.016100200150010586, + "loss": 3.321, + "mean_token_accuracy": 0.39207208156585693, + "num_tokens": 3110763929.0, + "step": 6084 + }, + { + "epoch": 1.6454840454299622, + "grad_norm": 2.84375, + "learning_rate": 0.01609890115211464, + "loss": 3.2602, + "mean_token_accuracy": 0.36924007534980774, + "num_tokens": 3111288141.0, + "step": 6085 + }, + { + "epoch": 1.6457544618712818, + "grad_norm": 2.046875, + "learning_rate": 0.016097601997769412, + "loss": 3.1635, + "mean_token_accuracy": 0.3949314057826996, + "num_tokens": 3111812350.0, + "step": 6086 + }, + { + "epoch": 1.6460248783126015, + "grad_norm": 2.75, + "learning_rate": 0.01609630268701476, + "loss": 2.8025, + "mean_token_accuracy": 0.4281958043575287, + "num_tokens": 3112336484.0, + "step": 6087 + }, + { + "epoch": 1.6462952947539211, + "grad_norm": 2.453125, + "learning_rate": 0.016095003219890557, + "loss": 3.0715, + "mean_token_accuracy": 0.39752036333084106, + "num_tokens": 3112860482.0, + "step": 6088 + }, + { + "epoch": 1.6465657111952408, + "grad_norm": 2.78125, + "learning_rate": 0.016093703596436672, + "loss": 3.332, + "mean_token_accuracy": 0.38162168860435486, + "num_tokens": 3113384671.0, + "step": 6089 + }, + { + "epoch": 1.6468361276365604, + "grad_norm": 2.546875, + "learning_rate": 0.01609240381669298, + "loss": 3.2348, + "mean_token_accuracy": 0.3906586468219757, + "num_tokens": 3113908877.0, + "step": 6090 + }, + { + "epoch": 1.64710654407788, + "grad_norm": 72.5, + "learning_rate": 0.016091103880699363, + "loss": 13.542, + "mean_token_accuracy": 0.002744111232459545, + "num_tokens": 3114425928.0, + "step": 6091 + }, + { + "epoch": 1.6473769605191997, + "grad_norm": 6.78125, + "learning_rate": 0.01608980378849571, + "loss": 3.7623, + "mean_token_accuracy": 0.30949968099594116, + "num_tokens": 3114950066.0, + "step": 6092 + }, + { + "epoch": 1.6476473769605193, + "grad_norm": 2.15625, + "learning_rate": 0.016088503540121903, + "loss": 3.2573, + "mean_token_accuracy": 0.3888084292411804, + "num_tokens": 3115462989.0, + "step": 6093 + }, + { + "epoch": 1.647917793401839, + "grad_norm": 3.8125, + "learning_rate": 0.016087203135617845, + "loss": 3.5521, + "mean_token_accuracy": 0.3565860092639923, + "num_tokens": 3115987245.0, + "step": 6094 + }, + { + "epoch": 1.6481882098431586, + "grad_norm": 2.796875, + "learning_rate": 0.016085902575023438, + "loss": 3.2544, + "mean_token_accuracy": 0.3750515580177307, + "num_tokens": 3116511348.0, + "step": 6095 + }, + { + "epoch": 1.6484586262844783, + "grad_norm": 4.15625, + "learning_rate": 0.01608460185837858, + "loss": 2.9921, + "mean_token_accuracy": 0.4111814796924591, + "num_tokens": 3116976515.0, + "step": 6096 + }, + { + "epoch": 1.6487290427257977, + "grad_norm": 2.859375, + "learning_rate": 0.016083300985723185, + "loss": 3.1973, + "mean_token_accuracy": 0.40768513083457947, + "num_tokens": 3117500707.0, + "step": 6097 + }, + { + "epoch": 1.6489994591671173, + "grad_norm": 3.390625, + "learning_rate": 0.016081999957097165, + "loss": 3.3134, + "mean_token_accuracy": 0.36114317178726196, + "num_tokens": 3118024986.0, + "step": 6098 + }, + { + "epoch": 1.649269875608437, + "grad_norm": 2.625, + "learning_rate": 0.016080698772540446, + "loss": 3.375, + "mean_token_accuracy": 0.38891148567199707, + "num_tokens": 3118543669.0, + "step": 6099 + }, + { + "epoch": 1.6495402920497566, + "grad_norm": 4.75, + "learning_rate": 0.01607939743209294, + "loss": 3.4966, + "mean_token_accuracy": 0.362258642911911, + "num_tokens": 3119067892.0, + "step": 6100 + }, + { + "epoch": 1.6498107084910762, + "grad_norm": 2.09375, + "learning_rate": 0.016078095935794588, + "loss": 3.2282, + "mean_token_accuracy": 0.39758193492889404, + "num_tokens": 3119592100.0, + "step": 6101 + }, + { + "epoch": 1.650081124932396, + "grad_norm": 3.75, + "learning_rate": 0.01607679428368532, + "loss": 3.0513, + "mean_token_accuracy": 0.44599735736846924, + "num_tokens": 3120116318.0, + "step": 6102 + }, + { + "epoch": 1.6503515413737155, + "grad_norm": 3.28125, + "learning_rate": 0.016075492475805073, + "loss": 3.3958, + "mean_token_accuracy": 0.379440039396286, + "num_tokens": 3120640579.0, + "step": 6103 + }, + { + "epoch": 1.6506219578150352, + "grad_norm": 3.4375, + "learning_rate": 0.016074190512193786, + "loss": 3.314, + "mean_token_accuracy": 0.4088343381881714, + "num_tokens": 3121092259.0, + "step": 6104 + }, + { + "epoch": 1.6508923742563548, + "grad_norm": 3.359375, + "learning_rate": 0.016072888392891418, + "loss": 3.3765, + "mean_token_accuracy": 0.36774569749832153, + "num_tokens": 3121616474.0, + "step": 6105 + }, + { + "epoch": 1.6511627906976745, + "grad_norm": 2.734375, + "learning_rate": 0.01607158611793791, + "loss": 2.9537, + "mean_token_accuracy": 0.3862473666667938, + "num_tokens": 3122140717.0, + "step": 6106 + }, + { + "epoch": 1.6514332071389939, + "grad_norm": 2.53125, + "learning_rate": 0.01607028368737323, + "loss": 3.0868, + "mean_token_accuracy": 0.40516412258148193, + "num_tokens": 3122619284.0, + "step": 6107 + }, + { + "epoch": 1.6517036235803135, + "grad_norm": 3.0, + "learning_rate": 0.016068981101237337, + "loss": 3.3606, + "mean_token_accuracy": 0.37696152925491333, + "num_tokens": 3123129372.0, + "step": 6108 + }, + { + "epoch": 1.6519740400216332, + "grad_norm": 3.890625, + "learning_rate": 0.01606767835957019, + "loss": 3.2158, + "mean_token_accuracy": 0.40456920862197876, + "num_tokens": 3123593454.0, + "step": 6109 + }, + { + "epoch": 1.6522444564629528, + "grad_norm": 3.1875, + "learning_rate": 0.016066375462411773, + "loss": 3.3026, + "mean_token_accuracy": 0.3690507411956787, + "num_tokens": 3124117713.0, + "step": 6110 + }, + { + "epoch": 1.6525148729042725, + "grad_norm": 23.875, + "learning_rate": 0.016065072409802057, + "loss": 10.6286, + "mean_token_accuracy": 0.00494425930082798, + "num_tokens": 3124641945.0, + "step": 6111 + }, + { + "epoch": 1.652785289345592, + "grad_norm": 9.125, + "learning_rate": 0.016063769201781024, + "loss": 4.0345, + "mean_token_accuracy": 0.3066727817058563, + "num_tokens": 3125129406.0, + "step": 6112 + }, + { + "epoch": 1.6530557057869117, + "grad_norm": 36.5, + "learning_rate": 0.01606246583838866, + "loss": 3.2977, + "mean_token_accuracy": 0.3903990685939789, + "num_tokens": 3125653652.0, + "step": 6113 + }, + { + "epoch": 1.6533261222282314, + "grad_norm": 3.03125, + "learning_rate": 0.016061162319664955, + "loss": 3.5197, + "mean_token_accuracy": 0.3566410541534424, + "num_tokens": 3126177829.0, + "step": 6114 + }, + { + "epoch": 1.653596538669551, + "grad_norm": 2.78125, + "learning_rate": 0.01605985864564991, + "loss": 3.543, + "mean_token_accuracy": 0.3491044044494629, + "num_tokens": 3126644596.0, + "step": 6115 + }, + { + "epoch": 1.6538669551108707, + "grad_norm": 4.21875, + "learning_rate": 0.01605855481638352, + "loss": 3.5776, + "mean_token_accuracy": 0.3589310646057129, + "num_tokens": 3127168842.0, + "step": 6116 + }, + { + "epoch": 1.6541373715521903, + "grad_norm": 3.484375, + "learning_rate": 0.016057250831905786, + "loss": 3.5624, + "mean_token_accuracy": 0.3430943489074707, + "num_tokens": 3127693090.0, + "step": 6117 + }, + { + "epoch": 1.65440778799351, + "grad_norm": 2.234375, + "learning_rate": 0.016055946692256733, + "loss": 3.2198, + "mean_token_accuracy": 0.4002783000469208, + "num_tokens": 3128217216.0, + "step": 6118 + }, + { + "epoch": 1.6546782044348296, + "grad_norm": 2.953125, + "learning_rate": 0.016054642397476362, + "loss": 3.1315, + "mean_token_accuracy": 0.38833221793174744, + "num_tokens": 3128666014.0, + "step": 6119 + }, + { + "epoch": 1.6549486208761492, + "grad_norm": 2.703125, + "learning_rate": 0.016053337947604697, + "loss": 3.3957, + "mean_token_accuracy": 0.3714269995689392, + "num_tokens": 3129190260.0, + "step": 6120 + }, + { + "epoch": 1.6552190373174689, + "grad_norm": 3.515625, + "learning_rate": 0.01605203334268176, + "loss": 3.5383, + "mean_token_accuracy": 0.363588809967041, + "num_tokens": 3129714523.0, + "step": 6121 + }, + { + "epoch": 1.6554894537587885, + "grad_norm": 2.734375, + "learning_rate": 0.01605072858274759, + "loss": 3.4113, + "mean_token_accuracy": 0.37105169892311096, + "num_tokens": 3130229123.0, + "step": 6122 + }, + { + "epoch": 1.6557598702001082, + "grad_norm": 2.46875, + "learning_rate": 0.016049423667842212, + "loss": 3.2722, + "mean_token_accuracy": 0.3815903663635254, + "num_tokens": 3130753348.0, + "step": 6123 + }, + { + "epoch": 1.6560302866414278, + "grad_norm": 2.40625, + "learning_rate": 0.016048118598005666, + "loss": 3.2242, + "mean_token_accuracy": 0.3733642101287842, + "num_tokens": 3131277603.0, + "step": 6124 + }, + { + "epoch": 1.6563007030827475, + "grad_norm": 2.890625, + "learning_rate": 0.016046813373278, + "loss": 3.4954, + "mean_token_accuracy": 0.38977891206741333, + "num_tokens": 3131741677.0, + "step": 6125 + }, + { + "epoch": 1.656571119524067, + "grad_norm": 2.78125, + "learning_rate": 0.016045507993699252, + "loss": 3.1356, + "mean_token_accuracy": 0.4042041003704071, + "num_tokens": 3132265930.0, + "step": 6126 + }, + { + "epoch": 1.6568415359653867, + "grad_norm": 2.53125, + "learning_rate": 0.01604420245930948, + "loss": 3.0881, + "mean_token_accuracy": 0.3959839344024658, + "num_tokens": 3132790114.0, + "step": 6127 + }, + { + "epoch": 1.6571119524067064, + "grad_norm": 2.734375, + "learning_rate": 0.016042896770148744, + "loss": 3.0588, + "mean_token_accuracy": 0.39321911334991455, + "num_tokens": 3133302475.0, + "step": 6128 + }, + { + "epoch": 1.657382368848026, + "grad_norm": 2.15625, + "learning_rate": 0.016041590926257107, + "loss": 3.2903, + "mean_token_accuracy": 0.36304742097854614, + "num_tokens": 3133826750.0, + "step": 6129 + }, + { + "epoch": 1.6576527852893457, + "grad_norm": 3.265625, + "learning_rate": 0.016040284927674633, + "loss": 3.3373, + "mean_token_accuracy": 0.37960416078567505, + "num_tokens": 3134350905.0, + "step": 6130 + }, + { + "epoch": 1.6579232017306653, + "grad_norm": 125.0, + "learning_rate": 0.016038978774441395, + "loss": 14.1018, + "mean_token_accuracy": 3.382217755643069e-06, + "num_tokens": 3134875014.0, + "step": 6131 + }, + { + "epoch": 1.658193618171985, + "grad_norm": 6.46875, + "learning_rate": 0.016037672466597472, + "loss": 3.6488, + "mean_token_accuracy": 0.3570335805416107, + "num_tokens": 3135399215.0, + "step": 6132 + }, + { + "epoch": 1.6584640346133046, + "grad_norm": 2.25, + "learning_rate": 0.01603636600418294, + "loss": 3.1711, + "mean_token_accuracy": 0.40959423780441284, + "num_tokens": 3135860554.0, + "step": 6133 + }, + { + "epoch": 1.6587344510546242, + "grad_norm": 3.09375, + "learning_rate": 0.016035059387237885, + "loss": 3.2508, + "mean_token_accuracy": 0.3712989389896393, + "num_tokens": 3136384585.0, + "step": 6134 + }, + { + "epoch": 1.659004867495944, + "grad_norm": 2.75, + "learning_rate": 0.016033752615802404, + "loss": 3.1781, + "mean_token_accuracy": 0.3804023861885071, + "num_tokens": 3136908859.0, + "step": 6135 + }, + { + "epoch": 1.6592752839372635, + "grad_norm": 3.390625, + "learning_rate": 0.01603244568991659, + "loss": 3.3453, + "mean_token_accuracy": 0.3898414969444275, + "num_tokens": 3137413974.0, + "step": 6136 + }, + { + "epoch": 1.6595457003785832, + "grad_norm": 3.0, + "learning_rate": 0.016031138609620542, + "loss": 3.1752, + "mean_token_accuracy": 0.3860766887664795, + "num_tokens": 3137886562.0, + "step": 6137 + }, + { + "epoch": 1.6598161168199026, + "grad_norm": 3.28125, + "learning_rate": 0.016029831374954367, + "loss": 3.2987, + "mean_token_accuracy": 0.3716273903846741, + "num_tokens": 3138410721.0, + "step": 6138 + }, + { + "epoch": 1.6600865332612222, + "grad_norm": 2.46875, + "learning_rate": 0.016028523985958172, + "loss": 3.289, + "mean_token_accuracy": 0.37193161249160767, + "num_tokens": 3138934793.0, + "step": 6139 + }, + { + "epoch": 1.6603569497025419, + "grad_norm": 3.015625, + "learning_rate": 0.016027216442672073, + "loss": 3.3841, + "mean_token_accuracy": 0.39778536558151245, + "num_tokens": 3139459063.0, + "step": 6140 + }, + { + "epoch": 1.6606273661438615, + "grad_norm": 2.859375, + "learning_rate": 0.01602590874513619, + "loss": 3.1874, + "mean_token_accuracy": 0.3716574013233185, + "num_tokens": 3139983245.0, + "step": 6141 + }, + { + "epoch": 1.6608977825851812, + "grad_norm": 2.515625, + "learning_rate": 0.01602460089339065, + "loss": 3.0196, + "mean_token_accuracy": 0.3964812755584717, + "num_tokens": 3140507391.0, + "step": 6142 + }, + { + "epoch": 1.6611681990265008, + "grad_norm": 2.75, + "learning_rate": 0.016023292887475574, + "loss": 3.3742, + "mean_token_accuracy": 0.3681606948375702, + "num_tokens": 3141031590.0, + "step": 6143 + }, + { + "epoch": 1.6614386154678205, + "grad_norm": 3.15625, + "learning_rate": 0.0160219847274311, + "loss": 3.392, + "mean_token_accuracy": 0.3869357407093048, + "num_tokens": 3141525466.0, + "step": 6144 + }, + { + "epoch": 1.66170903190914, + "grad_norm": 2.953125, + "learning_rate": 0.01602067641329737, + "loss": 3.553, + "mean_token_accuracy": 0.35170280933380127, + "num_tokens": 3142049699.0, + "step": 6145 + }, + { + "epoch": 1.6619794483504597, + "grad_norm": 3.109375, + "learning_rate": 0.016019367945114514, + "loss": 3.3115, + "mean_token_accuracy": 0.37406396865844727, + "num_tokens": 3142573855.0, + "step": 6146 + }, + { + "epoch": 1.6622498647917794, + "grad_norm": 2.59375, + "learning_rate": 0.016018059322922697, + "loss": 3.2311, + "mean_token_accuracy": 0.37328094244003296, + "num_tokens": 3143098012.0, + "step": 6147 + }, + { + "epoch": 1.6625202812330988, + "grad_norm": 2.21875, + "learning_rate": 0.01601675054676206, + "loss": 3.2747, + "mean_token_accuracy": 0.38226890563964844, + "num_tokens": 3143622290.0, + "step": 6148 + }, + { + "epoch": 1.6627906976744184, + "grad_norm": 2.8125, + "learning_rate": 0.016015441616672763, + "loss": 3.3869, + "mean_token_accuracy": 0.39068400859832764, + "num_tokens": 3144091817.0, + "step": 6149 + }, + { + "epoch": 1.663061114115738, + "grad_norm": 2.203125, + "learning_rate": 0.016014132532694965, + "loss": 3.0549, + "mean_token_accuracy": 0.4023504853248596, + "num_tokens": 3144597815.0, + "step": 6150 + }, + { + "epoch": 1.6633315305570577, + "grad_norm": 1.71875, + "learning_rate": 0.01601282329486884, + "loss": 9.8056, + "mean_token_accuracy": 0.0005704668583348393, + "num_tokens": 3145122064.0, + "step": 6151 + }, + { + "epoch": 1.6636019469983774, + "grad_norm": 7.40625, + "learning_rate": 0.01601151390323455, + "loss": 3.6898, + "mean_token_accuracy": 0.35674428939819336, + "num_tokens": 3145646183.0, + "step": 6152 + }, + { + "epoch": 1.663872363439697, + "grad_norm": 3.4375, + "learning_rate": 0.01601020435783228, + "loss": 3.4303, + "mean_token_accuracy": 0.375390887260437, + "num_tokens": 3146170426.0, + "step": 6153 + }, + { + "epoch": 1.6641427798810167, + "grad_norm": 3.0625, + "learning_rate": 0.016008894658702203, + "loss": 3.3437, + "mean_token_accuracy": 0.38116246461868286, + "num_tokens": 3146694583.0, + "step": 6154 + }, + { + "epoch": 1.6644131963223363, + "grad_norm": 3.140625, + "learning_rate": 0.01600758480588451, + "loss": 3.4738, + "mean_token_accuracy": 0.3712712824344635, + "num_tokens": 3147168116.0, + "step": 6155 + }, + { + "epoch": 1.664683612763656, + "grad_norm": 2.53125, + "learning_rate": 0.016006274799419386, + "loss": 3.3477, + "mean_token_accuracy": 0.3784940838813782, + "num_tokens": 3147692249.0, + "step": 6156 + }, + { + "epoch": 1.6649540292049756, + "grad_norm": 3.390625, + "learning_rate": 0.01600496463934703, + "loss": 3.3439, + "mean_token_accuracy": 0.37406694889068604, + "num_tokens": 3148216448.0, + "step": 6157 + }, + { + "epoch": 1.6652244456462952, + "grad_norm": 2.921875, + "learning_rate": 0.01600365432570764, + "loss": 3.2646, + "mean_token_accuracy": 0.3860073387622833, + "num_tokens": 3148740651.0, + "step": 6158 + }, + { + "epoch": 1.6654948620876149, + "grad_norm": 3.15625, + "learning_rate": 0.01600234385854142, + "loss": 3.3027, + "mean_token_accuracy": 0.3931804597377777, + "num_tokens": 3149264886.0, + "step": 6159 + }, + { + "epoch": 1.6657652785289345, + "grad_norm": 3.734375, + "learning_rate": 0.01600103323788858, + "loss": 3.1671, + "mean_token_accuracy": 0.3932841420173645, + "num_tokens": 3149789063.0, + "step": 6160 + }, + { + "epoch": 1.6660356949702542, + "grad_norm": 3.140625, + "learning_rate": 0.015999722463789338, + "loss": 3.2511, + "mean_token_accuracy": 0.3998414874076843, + "num_tokens": 3150313342.0, + "step": 6161 + }, + { + "epoch": 1.6663061114115738, + "grad_norm": 2.703125, + "learning_rate": 0.0159984115362839, + "loss": 3.2632, + "mean_token_accuracy": 0.37064141035079956, + "num_tokens": 3150837590.0, + "step": 6162 + }, + { + "epoch": 1.6665765278528935, + "grad_norm": 3.703125, + "learning_rate": 0.0159971004554125, + "loss": 2.9137, + "mean_token_accuracy": 0.4046700894832611, + "num_tokens": 3151361584.0, + "step": 6163 + }, + { + "epoch": 1.666846944294213, + "grad_norm": 2.65625, + "learning_rate": 0.015995789221215362, + "loss": 3.2628, + "mean_token_accuracy": 0.3985304832458496, + "num_tokens": 3151885650.0, + "step": 6164 + }, + { + "epoch": 1.6671173607355327, + "grad_norm": 3.390625, + "learning_rate": 0.015994477833732722, + "loss": 3.2443, + "mean_token_accuracy": 0.3662906885147095, + "num_tokens": 3152409933.0, + "step": 6165 + }, + { + "epoch": 1.6673877771768524, + "grad_norm": 3.3125, + "learning_rate": 0.01599316629300481, + "loss": 3.1376, + "mean_token_accuracy": 0.4090997278690338, + "num_tokens": 3152934070.0, + "step": 6166 + }, + { + "epoch": 1.667658193618172, + "grad_norm": 71.0, + "learning_rate": 0.015991854599071872, + "loss": 4.5775, + "mean_token_accuracy": 0.33229851722717285, + "num_tokens": 3153458255.0, + "step": 6167 + }, + { + "epoch": 1.6679286100594917, + "grad_norm": 14.25, + "learning_rate": 0.015990542751974157, + "loss": 3.1701, + "mean_token_accuracy": 0.43908554315567017, + "num_tokens": 3153982498.0, + "step": 6168 + }, + { + "epoch": 1.6681990265008113, + "grad_norm": 2.859375, + "learning_rate": 0.015989230751751914, + "loss": 3.6088, + "mean_token_accuracy": 0.35066866874694824, + "num_tokens": 3154506753.0, + "step": 6169 + }, + { + "epoch": 1.668469442942131, + "grad_norm": 2.859375, + "learning_rate": 0.015987918598445398, + "loss": 3.1419, + "mean_token_accuracy": 0.36562323570251465, + "num_tokens": 3154993879.0, + "step": 6170 + }, + { + "epoch": 1.6687398593834506, + "grad_norm": 65.5, + "learning_rate": 0.015986606292094873, + "loss": 12.123, + "mean_token_accuracy": 0.014240014366805553, + "num_tokens": 3155518067.0, + "step": 6171 + }, + { + "epoch": 1.6690102758247702, + "grad_norm": 8.875, + "learning_rate": 0.015985293832740603, + "loss": 4.0069, + "mean_token_accuracy": 0.32041943073272705, + "num_tokens": 3156042267.0, + "step": 6172 + }, + { + "epoch": 1.6692806922660899, + "grad_norm": 3.953125, + "learning_rate": 0.015983981220422855, + "loss": 3.4955, + "mean_token_accuracy": 0.37414950132369995, + "num_tokens": 3156566540.0, + "step": 6173 + }, + { + "epoch": 1.6695511087074095, + "grad_norm": 2.4375, + "learning_rate": 0.015982668455181908, + "loss": 3.568, + "mean_token_accuracy": 0.3805314004421234, + "num_tokens": 3157055061.0, + "step": 6174 + }, + { + "epoch": 1.6698215251487292, + "grad_norm": 3.078125, + "learning_rate": 0.015981355537058036, + "loss": 3.5976, + "mean_token_accuracy": 0.36820554733276367, + "num_tokens": 3157517893.0, + "step": 6175 + }, + { + "epoch": 1.6700919415900488, + "grad_norm": 2.671875, + "learning_rate": 0.01598004246609153, + "loss": 3.3322, + "mean_token_accuracy": 0.3647196292877197, + "num_tokens": 3157993781.0, + "step": 6176 + }, + { + "epoch": 1.6703623580313685, + "grad_norm": 3.265625, + "learning_rate": 0.015978729242322673, + "loss": 3.166, + "mean_token_accuracy": 0.377019464969635, + "num_tokens": 3158518034.0, + "step": 6177 + }, + { + "epoch": 1.670632774472688, + "grad_norm": 2.234375, + "learning_rate": 0.015977415865791766, + "loss": 3.0341, + "mean_token_accuracy": 0.3779143691062927, + "num_tokens": 3159030521.0, + "step": 6178 + }, + { + "epoch": 1.6709031909140075, + "grad_norm": 2.984375, + "learning_rate": 0.0159761023365391, + "loss": 3.3679, + "mean_token_accuracy": 0.4189361333847046, + "num_tokens": 3159453455.0, + "step": 6179 + }, + { + "epoch": 1.6711736073553272, + "grad_norm": 5.65625, + "learning_rate": 0.015974788654604978, + "loss": 3.4514, + "mean_token_accuracy": 0.3763853907585144, + "num_tokens": 3159977612.0, + "step": 6180 + }, + { + "epoch": 1.6714440237966468, + "grad_norm": 3.890625, + "learning_rate": 0.015973474820029716, + "loss": 3.579, + "mean_token_accuracy": 0.40387552976608276, + "num_tokens": 3160473621.0, + "step": 6181 + }, + { + "epoch": 1.6717144402379664, + "grad_norm": 3.59375, + "learning_rate": 0.015972160832853615, + "loss": 3.1875, + "mean_token_accuracy": 0.3831397294998169, + "num_tokens": 3160997902.0, + "step": 6182 + }, + { + "epoch": 1.671984856679286, + "grad_norm": 2.359375, + "learning_rate": 0.015970846693117, + "loss": 3.4671, + "mean_token_accuracy": 0.3678237199783325, + "num_tokens": 3161522112.0, + "step": 6183 + }, + { + "epoch": 1.6722552731206057, + "grad_norm": 3.390625, + "learning_rate": 0.015969532400860194, + "loss": 3.4669, + "mean_token_accuracy": 0.33510681986808777, + "num_tokens": 3162046376.0, + "step": 6184 + }, + { + "epoch": 1.6725256895619254, + "grad_norm": 2.640625, + "learning_rate": 0.015968217956123514, + "loss": 3.45, + "mean_token_accuracy": 0.36606404185295105, + "num_tokens": 3162570627.0, + "step": 6185 + }, + { + "epoch": 1.672796106003245, + "grad_norm": 3.0, + "learning_rate": 0.015966903358947297, + "loss": 3.4053, + "mean_token_accuracy": 0.37962353229522705, + "num_tokens": 3163094851.0, + "step": 6186 + }, + { + "epoch": 1.6730665224445647, + "grad_norm": 2.671875, + "learning_rate": 0.01596558860937188, + "loss": 3.1983, + "mean_token_accuracy": 0.3944564461708069, + "num_tokens": 3163619079.0, + "step": 6187 + }, + { + "epoch": 1.6733369388858843, + "grad_norm": 3.90625, + "learning_rate": 0.015964273707437603, + "loss": 3.3368, + "mean_token_accuracy": 0.37933552265167236, + "num_tokens": 3164143338.0, + "step": 6188 + }, + { + "epoch": 1.673607355327204, + "grad_norm": 2.84375, + "learning_rate": 0.015962958653184804, + "loss": 3.2453, + "mean_token_accuracy": 0.38406598567962646, + "num_tokens": 3164637648.0, + "step": 6189 + }, + { + "epoch": 1.6738777717685234, + "grad_norm": 3.0, + "learning_rate": 0.015961643446653843, + "loss": 3.3168, + "mean_token_accuracy": 0.38516589999198914, + "num_tokens": 3165161927.0, + "step": 6190 + }, + { + "epoch": 1.674148188209843, + "grad_norm": 66.5, + "learning_rate": 0.015960328087885067, + "loss": 12.1156, + "mean_token_accuracy": 0.015206960029900074, + "num_tokens": 3165686211.0, + "step": 6191 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 8.3125, + "learning_rate": 0.01595901257691884, + "loss": 3.876, + "mean_token_accuracy": 0.3610254228115082, + "num_tokens": 3166174050.0, + "step": 6192 + }, + { + "epoch": 1.6746890210924823, + "grad_norm": 3.203125, + "learning_rate": 0.015957696913795522, + "loss": 3.553, + "mean_token_accuracy": 0.32969534397125244, + "num_tokens": 3166698289.0, + "step": 6193 + }, + { + "epoch": 1.674959437533802, + "grad_norm": 2.1875, + "learning_rate": 0.01595638109855549, + "loss": 3.2682, + "mean_token_accuracy": 0.39031827449798584, + "num_tokens": 3167195388.0, + "step": 6194 + }, + { + "epoch": 1.6752298539751216, + "grad_norm": 2.875, + "learning_rate": 0.0159550651312391, + "loss": 3.2705, + "mean_token_accuracy": 0.38716888427734375, + "num_tokens": 3167719505.0, + "step": 6195 + }, + { + "epoch": 1.6755002704164412, + "grad_norm": 2.328125, + "learning_rate": 0.015953749011886745, + "loss": 3.2952, + "mean_token_accuracy": 0.38053038716316223, + "num_tokens": 3168243712.0, + "step": 6196 + }, + { + "epoch": 1.6757706868577609, + "grad_norm": 2.84375, + "learning_rate": 0.015952432740538803, + "loss": 3.1786, + "mean_token_accuracy": 0.40602895617485046, + "num_tokens": 3168767900.0, + "step": 6197 + }, + { + "epoch": 1.6760411032990805, + "grad_norm": 3.140625, + "learning_rate": 0.01595111631723566, + "loss": 3.2832, + "mean_token_accuracy": 0.387445867061615, + "num_tokens": 3169235578.0, + "step": 6198 + }, + { + "epoch": 1.6763115197404002, + "grad_norm": 8.25, + "learning_rate": 0.015949799742017705, + "loss": 3.4216, + "mean_token_accuracy": 0.3805375397205353, + "num_tokens": 3169759688.0, + "step": 6199 + }, + { + "epoch": 1.6765819361817198, + "grad_norm": 2.15625, + "learning_rate": 0.015948483014925338, + "loss": 3.3014, + "mean_token_accuracy": 0.3836117386817932, + "num_tokens": 3170283954.0, + "step": 6200 + }, + { + "epoch": 1.6768523526230394, + "grad_norm": 2.34375, + "learning_rate": 0.015947166135998964, + "loss": 3.167, + "mean_token_accuracy": 0.40366631746292114, + "num_tokens": 3170752137.0, + "step": 6201 + }, + { + "epoch": 1.677122769064359, + "grad_norm": 2.765625, + "learning_rate": 0.01594584910527898, + "loss": 2.9857, + "mean_token_accuracy": 0.4020196199417114, + "num_tokens": 3171248285.0, + "step": 6202 + }, + { + "epoch": 1.6773931855056787, + "grad_norm": 2.84375, + "learning_rate": 0.015944531922805802, + "loss": 3.3121, + "mean_token_accuracy": 0.3755907416343689, + "num_tokens": 3171770385.0, + "step": 6203 + }, + { + "epoch": 1.6776636019469984, + "grad_norm": 3.59375, + "learning_rate": 0.015943214588619845, + "loss": 3.3763, + "mean_token_accuracy": 0.36859917640686035, + "num_tokens": 3172279684.0, + "step": 6204 + }, + { + "epoch": 1.677934018388318, + "grad_norm": 3.59375, + "learning_rate": 0.015941897102761525, + "loss": 3.1663, + "mean_token_accuracy": 0.40557655692100525, + "num_tokens": 3172771207.0, + "step": 6205 + }, + { + "epoch": 1.6782044348296377, + "grad_norm": 3.0, + "learning_rate": 0.015940579465271273, + "loss": 3.3796, + "mean_token_accuracy": 0.37334543466567993, + "num_tokens": 3173295366.0, + "step": 6206 + }, + { + "epoch": 1.6784748512709573, + "grad_norm": 3.0625, + "learning_rate": 0.015939261676189507, + "loss": 3.4906, + "mean_token_accuracy": 0.36078089475631714, + "num_tokens": 3173819636.0, + "step": 6207 + }, + { + "epoch": 1.678745267712277, + "grad_norm": 2.25, + "learning_rate": 0.015937943735556674, + "loss": 3.2529, + "mean_token_accuracy": 0.3811771869659424, + "num_tokens": 3174343763.0, + "step": 6208 + }, + { + "epoch": 1.6790156841535966, + "grad_norm": 2.578125, + "learning_rate": 0.0159366256434132, + "loss": 3.4461, + "mean_token_accuracy": 0.37635838985443115, + "num_tokens": 3174867928.0, + "step": 6209 + }, + { + "epoch": 1.6792861005949162, + "grad_norm": 3.203125, + "learning_rate": 0.015935307399799536, + "loss": 3.1558, + "mean_token_accuracy": 0.3975028991699219, + "num_tokens": 3175354353.0, + "step": 6210 + }, + { + "epoch": 1.6795565170362359, + "grad_norm": 35.0, + "learning_rate": 0.01593398900475613, + "loss": 10.5231, + "mean_token_accuracy": 0.010909218341112137, + "num_tokens": 3175815758.0, + "step": 6211 + }, + { + "epoch": 1.6798269334775555, + "grad_norm": 8.125, + "learning_rate": 0.015932670458323427, + "loss": 3.7357, + "mean_token_accuracy": 0.3535084128379822, + "num_tokens": 3176339946.0, + "step": 6212 + }, + { + "epoch": 1.6800973499188752, + "grad_norm": 3.28125, + "learning_rate": 0.015931351760541886, + "loss": 3.3562, + "mean_token_accuracy": 0.40213900804519653, + "num_tokens": 3176794238.0, + "step": 6213 + }, + { + "epoch": 1.6803677663601948, + "grad_norm": 2.703125, + "learning_rate": 0.01593003291145198, + "loss": 3.3923, + "mean_token_accuracy": 0.35535115003585815, + "num_tokens": 3177318524.0, + "step": 6214 + }, + { + "epoch": 1.6806381828015144, + "grad_norm": 3.34375, + "learning_rate": 0.01592871391109416, + "loss": 3.1612, + "mean_token_accuracy": 0.40031158924102783, + "num_tokens": 3177842673.0, + "step": 6215 + }, + { + "epoch": 1.680908599242834, + "grad_norm": 2.796875, + "learning_rate": 0.015927394759508905, + "loss": 3.1586, + "mean_token_accuracy": 0.40092456340789795, + "num_tokens": 3178322054.0, + "step": 6216 + }, + { + "epoch": 1.6811790156841537, + "grad_norm": 3.171875, + "learning_rate": 0.015926075456736685, + "loss": 3.212, + "mean_token_accuracy": 0.36926549673080444, + "num_tokens": 3178846196.0, + "step": 6217 + }, + { + "epoch": 1.6814494321254734, + "grad_norm": 2.421875, + "learning_rate": 0.015924756002817987, + "loss": 3.5312, + "mean_token_accuracy": 0.3628590404987335, + "num_tokens": 3179370429.0, + "step": 6218 + }, + { + "epoch": 1.681719848566793, + "grad_norm": 2.9375, + "learning_rate": 0.015923436397793287, + "loss": 3.3255, + "mean_token_accuracy": 0.3757980465888977, + "num_tokens": 3179894694.0, + "step": 6219 + }, + { + "epoch": 1.6819902650081124, + "grad_norm": 3.28125, + "learning_rate": 0.015922116641703087, + "loss": 3.1585, + "mean_token_accuracy": 0.3960219621658325, + "num_tokens": 3180367609.0, + "step": 6220 + }, + { + "epoch": 1.682260681449432, + "grad_norm": 2.546875, + "learning_rate": 0.015920796734587867, + "loss": 3.2604, + "mean_token_accuracy": 0.37949705123901367, + "num_tokens": 3180891875.0, + "step": 6221 + }, + { + "epoch": 1.6825310978907517, + "grad_norm": 2.875, + "learning_rate": 0.015919476676488135, + "loss": 3.2175, + "mean_token_accuracy": 0.3692089319229126, + "num_tokens": 3181416123.0, + "step": 6222 + }, + { + "epoch": 1.6828015143320714, + "grad_norm": 3.53125, + "learning_rate": 0.015918156467444388, + "loss": 3.2282, + "mean_token_accuracy": 0.3773708641529083, + "num_tokens": 3181940358.0, + "step": 6223 + }, + { + "epoch": 1.683071930773391, + "grad_norm": 2.96875, + "learning_rate": 0.01591683610749714, + "loss": 3.271, + "mean_token_accuracy": 0.38462522625923157, + "num_tokens": 3182464431.0, + "step": 6224 + }, + { + "epoch": 1.6833423472147107, + "grad_norm": 2.84375, + "learning_rate": 0.015915515596686903, + "loss": 3.414, + "mean_token_accuracy": 0.35283321142196655, + "num_tokens": 3182988683.0, + "step": 6225 + }, + { + "epoch": 1.6836127636560303, + "grad_norm": 2.703125, + "learning_rate": 0.01591419493505419, + "loss": 3.3014, + "mean_token_accuracy": 0.3785715103149414, + "num_tokens": 3183512902.0, + "step": 6226 + }, + { + "epoch": 1.68388318009735, + "grad_norm": 2.59375, + "learning_rate": 0.015912874122639524, + "loss": 3.3797, + "mean_token_accuracy": 0.37089893221855164, + "num_tokens": 3184036977.0, + "step": 6227 + }, + { + "epoch": 1.6841535965386696, + "grad_norm": 2.671875, + "learning_rate": 0.015911553159483432, + "loss": 3.4932, + "mean_token_accuracy": 0.3656950294971466, + "num_tokens": 3184561235.0, + "step": 6228 + }, + { + "epoch": 1.6844240129799892, + "grad_norm": 9.75, + "learning_rate": 0.015910232045626443, + "loss": 3.4344, + "mean_token_accuracy": 0.395678848028183, + "num_tokens": 3185085398.0, + "step": 6229 + }, + { + "epoch": 1.6846944294213089, + "grad_norm": 2.140625, + "learning_rate": 0.0159089107811091, + "loss": 3.2734, + "mean_token_accuracy": 0.37734490633010864, + "num_tokens": 3185609684.0, + "step": 6230 + }, + { + "epoch": 1.6849648458626283, + "grad_norm": 14.6875, + "learning_rate": 0.015907589365971937, + "loss": 11.0016, + "mean_token_accuracy": 2.168331411667168e-05, + "num_tokens": 3186133924.0, + "step": 6231 + }, + { + "epoch": 1.685235262303948, + "grad_norm": 7.875, + "learning_rate": 0.015906267800255496, + "loss": 3.9173, + "mean_token_accuracy": 0.3475378453731537, + "num_tokens": 3186510050.0, + "step": 6232 + }, + { + "epoch": 1.6855056787452676, + "grad_norm": 2.296875, + "learning_rate": 0.015904946084000334, + "loss": 3.396, + "mean_token_accuracy": 0.3836500644683838, + "num_tokens": 3186971507.0, + "step": 6233 + }, + { + "epoch": 1.6857760951865872, + "grad_norm": 2.25, + "learning_rate": 0.015903624217247, + "loss": 3.1844, + "mean_token_accuracy": 0.3849719166755676, + "num_tokens": 3187495696.0, + "step": 6234 + }, + { + "epoch": 1.6860465116279069, + "grad_norm": 2.75, + "learning_rate": 0.01590230220003605, + "loss": 3.1022, + "mean_token_accuracy": 0.40905943512916565, + "num_tokens": 3187906105.0, + "step": 6235 + }, + { + "epoch": 1.6863169280692265, + "grad_norm": 2.640625, + "learning_rate": 0.015900980032408057, + "loss": 3.3378, + "mean_token_accuracy": 0.3714568018913269, + "num_tokens": 3188430333.0, + "step": 6236 + }, + { + "epoch": 1.6865873445105461, + "grad_norm": 3.5625, + "learning_rate": 0.015899657714403586, + "loss": 3.0573, + "mean_token_accuracy": 0.3702983260154724, + "num_tokens": 3188942187.0, + "step": 6237 + }, + { + "epoch": 1.6868577609518658, + "grad_norm": 2.921875, + "learning_rate": 0.015898335246063204, + "loss": 3.2078, + "mean_token_accuracy": 0.3730202317237854, + "num_tokens": 3189439237.0, + "step": 6238 + }, + { + "epoch": 1.6871281773931854, + "grad_norm": 2.953125, + "learning_rate": 0.01589701262742749, + "loss": 3.2674, + "mean_token_accuracy": 0.39208707213401794, + "num_tokens": 3189942514.0, + "step": 6239 + }, + { + "epoch": 1.687398593834505, + "grad_norm": 2.859375, + "learning_rate": 0.01589568985853703, + "loss": 3.2472, + "mean_token_accuracy": 0.3669710159301758, + "num_tokens": 3190466695.0, + "step": 6240 + }, + { + "epoch": 1.6876690102758247, + "grad_norm": 2.4375, + "learning_rate": 0.01589436693943241, + "loss": 2.9454, + "mean_token_accuracy": 0.40283679962158203, + "num_tokens": 3190957007.0, + "step": 6241 + }, + { + "epoch": 1.6879394267171444, + "grad_norm": 2.765625, + "learning_rate": 0.015893043870154215, + "loss": 3.4594, + "mean_token_accuracy": 0.39008426666259766, + "num_tokens": 3191387526.0, + "step": 6242 + }, + { + "epoch": 1.688209843158464, + "grad_norm": 3.03125, + "learning_rate": 0.015891720650743048, + "loss": 3.4947, + "mean_token_accuracy": 0.3632868826389313, + "num_tokens": 3191911769.0, + "step": 6243 + }, + { + "epoch": 1.6884802595997837, + "grad_norm": 2.96875, + "learning_rate": 0.015890397281239504, + "loss": 3.377, + "mean_token_accuracy": 0.3730108141899109, + "num_tokens": 3192435960.0, + "step": 6244 + }, + { + "epoch": 1.6887506760411033, + "grad_norm": 2.78125, + "learning_rate": 0.015889073761684188, + "loss": 3.3335, + "mean_token_accuracy": 0.3782195448875427, + "num_tokens": 3192960139.0, + "step": 6245 + }, + { + "epoch": 1.689021092482423, + "grad_norm": 2.328125, + "learning_rate": 0.015887750092117715, + "loss": 3.152, + "mean_token_accuracy": 0.407634973526001, + "num_tokens": 3193458033.0, + "step": 6246 + }, + { + "epoch": 1.6892915089237426, + "grad_norm": 3.03125, + "learning_rate": 0.015886426272580693, + "loss": 3.4158, + "mean_token_accuracy": 0.37405925989151, + "num_tokens": 3193982157.0, + "step": 6247 + }, + { + "epoch": 1.6895619253650622, + "grad_norm": 2.53125, + "learning_rate": 0.01588510230311374, + "loss": 3.1161, + "mean_token_accuracy": 0.39907386898994446, + "num_tokens": 3194506425.0, + "step": 6248 + }, + { + "epoch": 1.6898323418063819, + "grad_norm": 3.6875, + "learning_rate": 0.015883778183757486, + "loss": 3.4573, + "mean_token_accuracy": 0.36099839210510254, + "num_tokens": 3195030630.0, + "step": 6249 + }, + { + "epoch": 1.6901027582477015, + "grad_norm": 2.46875, + "learning_rate": 0.015882453914552556, + "loss": 3.3006, + "mean_token_accuracy": 0.3847063183784485, + "num_tokens": 3195506195.0, + "step": 6250 + }, + { + "epoch": 1.6903731746890212, + "grad_norm": 3.59375, + "learning_rate": 0.015881129495539575, + "loss": 9.7563, + "mean_token_accuracy": 0.008052281104028225, + "num_tokens": 3196030455.0, + "step": 6251 + }, + { + "epoch": 1.6906435911303408, + "grad_norm": 7.625, + "learning_rate": 0.015879804926759195, + "loss": 3.8314, + "mean_token_accuracy": 0.2982993423938751, + "num_tokens": 3196554663.0, + "step": 6252 + }, + { + "epoch": 1.6909140075716604, + "grad_norm": 2.125, + "learning_rate": 0.015878480208252048, + "loss": 3.3514, + "mean_token_accuracy": 0.35761404037475586, + "num_tokens": 3197078898.0, + "step": 6253 + }, + { + "epoch": 1.69118442401298, + "grad_norm": 2.453125, + "learning_rate": 0.015877155340058777, + "loss": 3.4343, + "mean_token_accuracy": 0.3858054280281067, + "num_tokens": 3197597365.0, + "step": 6254 + }, + { + "epoch": 1.6914548404542997, + "grad_norm": 3.453125, + "learning_rate": 0.01587583032222004, + "loss": 3.4293, + "mean_token_accuracy": 0.3423053026199341, + "num_tokens": 3198121612.0, + "step": 6255 + }, + { + "epoch": 1.6917252568956194, + "grad_norm": 3.265625, + "learning_rate": 0.015874505154776487, + "loss": 3.2861, + "mean_token_accuracy": 0.3777371644973755, + "num_tokens": 3198645857.0, + "step": 6256 + }, + { + "epoch": 1.691995673336939, + "grad_norm": 3.09375, + "learning_rate": 0.015873179837768783, + "loss": 3.0747, + "mean_token_accuracy": 0.3919219374656677, + "num_tokens": 3199169879.0, + "step": 6257 + }, + { + "epoch": 1.6922660897782587, + "grad_norm": 3.171875, + "learning_rate": 0.01587185437123759, + "loss": 3.3641, + "mean_token_accuracy": 0.3686118721961975, + "num_tokens": 3199694104.0, + "step": 6258 + }, + { + "epoch": 1.6925365062195783, + "grad_norm": 2.90625, + "learning_rate": 0.01587052875522358, + "loss": 3.1971, + "mean_token_accuracy": 0.37307101488113403, + "num_tokens": 3200218196.0, + "step": 6259 + }, + { + "epoch": 1.692806922660898, + "grad_norm": 2.359375, + "learning_rate": 0.015869202989767423, + "loss": 3.3567, + "mean_token_accuracy": 0.37537968158721924, + "num_tokens": 3200742394.0, + "step": 6260 + }, + { + "epoch": 1.6930773391022174, + "grad_norm": 3.578125, + "learning_rate": 0.015867877074909796, + "loss": 3.2143, + "mean_token_accuracy": 0.38846075534820557, + "num_tokens": 3201266607.0, + "step": 6261 + }, + { + "epoch": 1.693347755543537, + "grad_norm": 3.28125, + "learning_rate": 0.015866551010691392, + "loss": 3.2526, + "mean_token_accuracy": 0.3723219037055969, + "num_tokens": 3201790811.0, + "step": 6262 + }, + { + "epoch": 1.6936181719848566, + "grad_norm": 3.109375, + "learning_rate": 0.015865224797152886, + "loss": 3.2746, + "mean_token_accuracy": 0.3847671151161194, + "num_tokens": 3202314967.0, + "step": 6263 + }, + { + "epoch": 1.6938885884261763, + "grad_norm": 2.546875, + "learning_rate": 0.015863898434334972, + "loss": 3.243, + "mean_token_accuracy": 0.38263699412345886, + "num_tokens": 3202839187.0, + "step": 6264 + }, + { + "epoch": 1.694159004867496, + "grad_norm": 2.46875, + "learning_rate": 0.015862571922278357, + "loss": 3.2083, + "mean_token_accuracy": 0.4000439941883087, + "num_tokens": 3203363460.0, + "step": 6265 + }, + { + "epoch": 1.6944294213088156, + "grad_norm": 2.703125, + "learning_rate": 0.015861245261023733, + "loss": 3.2093, + "mean_token_accuracy": 0.3985461890697479, + "num_tokens": 3203850856.0, + "step": 6266 + }, + { + "epoch": 1.6946998377501352, + "grad_norm": 3.125, + "learning_rate": 0.01585991845061181, + "loss": 3.3894, + "mean_token_accuracy": 0.3856934905052185, + "num_tokens": 3204297956.0, + "step": 6267 + }, + { + "epoch": 1.6949702541914549, + "grad_norm": 2.921875, + "learning_rate": 0.015858591491083297, + "loss": 3.3044, + "mean_token_accuracy": 0.38137829303741455, + "num_tokens": 3204822135.0, + "step": 6268 + }, + { + "epoch": 1.6952406706327745, + "grad_norm": 3.75, + "learning_rate": 0.015857264382478906, + "loss": 3.456, + "mean_token_accuracy": 0.37225162982940674, + "num_tokens": 3205305170.0, + "step": 6269 + }, + { + "epoch": 1.6955110870740941, + "grad_norm": 3.25, + "learning_rate": 0.01585593712483936, + "loss": 3.3455, + "mean_token_accuracy": 0.38494402170181274, + "num_tokens": 3205829230.0, + "step": 6270 + }, + { + "epoch": 1.6957815035154138, + "grad_norm": 78.0, + "learning_rate": 0.015854609718205383, + "loss": 10.9846, + "mean_token_accuracy": 0.009720283560454845, + "num_tokens": 3206326361.0, + "step": 6271 + }, + { + "epoch": 1.6960519199567332, + "grad_norm": 8.0, + "learning_rate": 0.015853282162617704, + "loss": 3.5648, + "mean_token_accuracy": 0.3168277144432068, + "num_tokens": 3206850574.0, + "step": 6272 + }, + { + "epoch": 1.6963223363980529, + "grad_norm": 2.421875, + "learning_rate": 0.015851954458117056, + "loss": 3.3337, + "mean_token_accuracy": 0.38801300525665283, + "num_tokens": 3207310973.0, + "step": 6273 + }, + { + "epoch": 1.6965927528393725, + "grad_norm": 2.359375, + "learning_rate": 0.015850626604744174, + "loss": 3.3551, + "mean_token_accuracy": 0.36723777651786804, + "num_tokens": 3207835213.0, + "step": 6274 + }, + { + "epoch": 1.6968631692806921, + "grad_norm": 4.34375, + "learning_rate": 0.015849298602539807, + "loss": 3.3587, + "mean_token_accuracy": 0.356872022151947, + "num_tokens": 3208340649.0, + "step": 6275 + }, + { + "epoch": 1.6971335857220118, + "grad_norm": 2.75, + "learning_rate": 0.015847970451544697, + "loss": 3.2095, + "mean_token_accuracy": 0.3982931971549988, + "num_tokens": 3208864918.0, + "step": 6276 + }, + { + "epoch": 1.6974040021633314, + "grad_norm": 3.359375, + "learning_rate": 0.015846642151799595, + "loss": 3.3179, + "mean_token_accuracy": 0.37884628772735596, + "num_tokens": 3209389198.0, + "step": 6277 + }, + { + "epoch": 1.697674418604651, + "grad_norm": 3.0, + "learning_rate": 0.01584531370334526, + "loss": 3.0458, + "mean_token_accuracy": 0.4091516137123108, + "num_tokens": 3209913460.0, + "step": 6278 + }, + { + "epoch": 1.6979448350459707, + "grad_norm": 3.703125, + "learning_rate": 0.01584398510622245, + "loss": 3.2781, + "mean_token_accuracy": 0.3676656484603882, + "num_tokens": 3210437680.0, + "step": 6279 + }, + { + "epoch": 1.6982152514872904, + "grad_norm": 2.90625, + "learning_rate": 0.015842656360471934, + "loss": 3.3715, + "mean_token_accuracy": 0.3675393760204315, + "num_tokens": 3210954049.0, + "step": 6280 + }, + { + "epoch": 1.69848566792861, + "grad_norm": 3.125, + "learning_rate": 0.015841327466134477, + "loss": 3.1574, + "mean_token_accuracy": 0.4131650924682617, + "num_tokens": 3211434729.0, + "step": 6281 + }, + { + "epoch": 1.6987560843699296, + "grad_norm": 2.515625, + "learning_rate": 0.015839998423250855, + "loss": 3.2247, + "mean_token_accuracy": 0.39158254861831665, + "num_tokens": 3211925333.0, + "step": 6282 + }, + { + "epoch": 1.6990265008112493, + "grad_norm": 2.8125, + "learning_rate": 0.015838669231861852, + "loss": 3.2314, + "mean_token_accuracy": 0.39252063632011414, + "num_tokens": 3212449569.0, + "step": 6283 + }, + { + "epoch": 1.699296917252569, + "grad_norm": 2.75, + "learning_rate": 0.015837339892008244, + "loss": 3.3141, + "mean_token_accuracy": 0.3871980905532837, + "num_tokens": 3212973745.0, + "step": 6284 + }, + { + "epoch": 1.6995673336938886, + "grad_norm": 2.21875, + "learning_rate": 0.015836010403730824, + "loss": 3.0482, + "mean_token_accuracy": 0.40301066637039185, + "num_tokens": 3213459119.0, + "step": 6285 + }, + { + "epoch": 1.6998377501352082, + "grad_norm": 2.984375, + "learning_rate": 0.015834680767070383, + "loss": 3.2197, + "mean_token_accuracy": 0.38394781947135925, + "num_tokens": 3213958091.0, + "step": 6286 + }, + { + "epoch": 1.7001081665765279, + "grad_norm": 2.984375, + "learning_rate": 0.01583335098206772, + "loss": 3.1881, + "mean_token_accuracy": 0.3692322373390198, + "num_tokens": 3214482362.0, + "step": 6287 + }, + { + "epoch": 1.7003785830178475, + "grad_norm": 2.609375, + "learning_rate": 0.01583202104876363, + "loss": 3.252, + "mean_token_accuracy": 0.3920246958732605, + "num_tokens": 3215006559.0, + "step": 6288 + }, + { + "epoch": 1.7006489994591671, + "grad_norm": 2.953125, + "learning_rate": 0.015830690967198927, + "loss": 3.3383, + "mean_token_accuracy": 0.3778604567050934, + "num_tokens": 3215530838.0, + "step": 6289 + }, + { + "epoch": 1.7009194159004868, + "grad_norm": 2.65625, + "learning_rate": 0.01582936073741442, + "loss": 3.1367, + "mean_token_accuracy": 0.3976413309574127, + "num_tokens": 3215999237.0, + "step": 6290 + }, + { + "epoch": 1.7011898323418064, + "grad_norm": 43.0, + "learning_rate": 0.01582803035945092, + "loss": 11.6688, + "mean_token_accuracy": 0.026453930884599686, + "num_tokens": 3216523479.0, + "step": 6291 + }, + { + "epoch": 1.701460248783126, + "grad_norm": 7.46875, + "learning_rate": 0.015826699833349254, + "loss": 3.6591, + "mean_token_accuracy": 0.3322320580482483, + "num_tokens": 3217047725.0, + "step": 6292 + }, + { + "epoch": 1.7017306652244457, + "grad_norm": 2.28125, + "learning_rate": 0.015825369159150244, + "loss": 3.2735, + "mean_token_accuracy": 0.3810710310935974, + "num_tokens": 3217571880.0, + "step": 6293 + }, + { + "epoch": 1.7020010816657654, + "grad_norm": 3.15625, + "learning_rate": 0.015824038336894712, + "loss": 3.0493, + "mean_token_accuracy": 0.3830903470516205, + "num_tokens": 3218096084.0, + "step": 6294 + }, + { + "epoch": 1.702271498107085, + "grad_norm": 3.046875, + "learning_rate": 0.015822707366623497, + "loss": 3.1264, + "mean_token_accuracy": 0.39570480585098267, + "num_tokens": 3218620182.0, + "step": 6295 + }, + { + "epoch": 1.7025419145484046, + "grad_norm": 2.984375, + "learning_rate": 0.015821376248377445, + "loss": 3.1364, + "mean_token_accuracy": 0.37421298027038574, + "num_tokens": 3219144308.0, + "step": 6296 + }, + { + "epoch": 1.7028123309897243, + "grad_norm": 3.890625, + "learning_rate": 0.015820044982197384, + "loss": 3.325, + "mean_token_accuracy": 0.3561558425426483, + "num_tokens": 3219668364.0, + "step": 6297 + }, + { + "epoch": 1.703082747431044, + "grad_norm": 3.984375, + "learning_rate": 0.01581871356812417, + "loss": 3.0396, + "mean_token_accuracy": 0.37007880210876465, + "num_tokens": 3220192614.0, + "step": 6298 + }, + { + "epoch": 1.7033531638723636, + "grad_norm": 2.1875, + "learning_rate": 0.015817382006198658, + "loss": 3.356, + "mean_token_accuracy": 0.37181442975997925, + "num_tokens": 3220716887.0, + "step": 6299 + }, + { + "epoch": 1.7036235803136832, + "grad_norm": 2.390625, + "learning_rate": 0.015816050296461694, + "loss": 3.247, + "mean_token_accuracy": 0.3710329830646515, + "num_tokens": 3221192356.0, + "step": 6300 + }, + { + "epoch": 1.7038939967550029, + "grad_norm": 2.015625, + "learning_rate": 0.015814718438954142, + "loss": 3.4061, + "mean_token_accuracy": 0.37702447175979614, + "num_tokens": 3221716533.0, + "step": 6301 + }, + { + "epoch": 1.7041644131963225, + "grad_norm": 2.734375, + "learning_rate": 0.015813386433716874, + "loss": 3.1428, + "mean_token_accuracy": 0.38687437772750854, + "num_tokens": 3222240780.0, + "step": 6302 + }, + { + "epoch": 1.704434829637642, + "grad_norm": 3.703125, + "learning_rate": 0.015812054280790754, + "loss": 3.3821, + "mean_token_accuracy": 0.3919825851917267, + "num_tokens": 3222709655.0, + "step": 6303 + }, + { + "epoch": 1.7047052460789616, + "grad_norm": 4.4375, + "learning_rate": 0.01581072198021665, + "loss": 3.3891, + "mean_token_accuracy": 0.36001187562942505, + "num_tokens": 3223233915.0, + "step": 6304 + }, + { + "epoch": 1.7049756625202812, + "grad_norm": 3.140625, + "learning_rate": 0.015809389532035457, + "loss": 3.2884, + "mean_token_accuracy": 0.3916729688644409, + "num_tokens": 3223757997.0, + "step": 6305 + }, + { + "epoch": 1.7052460789616009, + "grad_norm": 4.125, + "learning_rate": 0.015808056936288048, + "loss": 3.4042, + "mean_token_accuracy": 0.37970417737960815, + "num_tokens": 3224282111.0, + "step": 6306 + }, + { + "epoch": 1.7055164954029205, + "grad_norm": 2.609375, + "learning_rate": 0.01580672419301531, + "loss": 3.2836, + "mean_token_accuracy": 0.3863637447357178, + "num_tokens": 3224806372.0, + "step": 6307 + }, + { + "epoch": 1.7057869118442401, + "grad_norm": 18.5, + "learning_rate": 0.01580539130225814, + "loss": 3.1021, + "mean_token_accuracy": 0.3868246078491211, + "num_tokens": 3225330540.0, + "step": 6308 + }, + { + "epoch": 1.7060573282855598, + "grad_norm": 3.796875, + "learning_rate": 0.01580405826405743, + "loss": 3.4648, + "mean_token_accuracy": 0.34296053647994995, + "num_tokens": 3225854655.0, + "step": 6309 + }, + { + "epoch": 1.7063277447268794, + "grad_norm": 1.9453125, + "learning_rate": 0.01580272507845409, + "loss": 3.2938, + "mean_token_accuracy": 0.38367384672164917, + "num_tokens": 3226378917.0, + "step": 6310 + }, + { + "epoch": 1.706598161168199, + "grad_norm": 50.0, + "learning_rate": 0.01580139174548902, + "loss": 12.4788, + "mean_token_accuracy": 0.010425560176372528, + "num_tokens": 3226903018.0, + "step": 6311 + }, + { + "epoch": 1.7068685776095187, + "grad_norm": 8.0, + "learning_rate": 0.015800058265203126, + "loss": 3.7708, + "mean_token_accuracy": 0.3045521378517151, + "num_tokens": 3227368301.0, + "step": 6312 + }, + { + "epoch": 1.7071389940508381, + "grad_norm": 2.65625, + "learning_rate": 0.015798724637637332, + "loss": 3.251, + "mean_token_accuracy": 0.37328869104385376, + "num_tokens": 3227892514.0, + "step": 6313 + }, + { + "epoch": 1.7074094104921578, + "grad_norm": 2.53125, + "learning_rate": 0.015797390862832553, + "loss": 3.2541, + "mean_token_accuracy": 0.37149107456207275, + "num_tokens": 3228416702.0, + "step": 6314 + }, + { + "epoch": 1.7076798269334774, + "grad_norm": 2.53125, + "learning_rate": 0.015796056940829716, + "loss": 3.2716, + "mean_token_accuracy": 0.38742953538894653, + "num_tokens": 3228890891.0, + "step": 6315 + }, + { + "epoch": 1.707950243374797, + "grad_norm": 2.78125, + "learning_rate": 0.015794722871669747, + "loss": 3.4034, + "mean_token_accuracy": 0.3755590319633484, + "num_tokens": 3229415000.0, + "step": 6316 + }, + { + "epoch": 1.7082206598161167, + "grad_norm": 3.265625, + "learning_rate": 0.015793388655393578, + "loss": 3.3713, + "mean_token_accuracy": 0.4025074243545532, + "num_tokens": 3229907968.0, + "step": 6317 + }, + { + "epoch": 1.7084910762574363, + "grad_norm": 4.15625, + "learning_rate": 0.015792054292042147, + "loss": 3.3957, + "mean_token_accuracy": 0.37889641523361206, + "num_tokens": 3230352249.0, + "step": 6318 + }, + { + "epoch": 1.708761492698756, + "grad_norm": 2.421875, + "learning_rate": 0.015790719781656404, + "loss": 3.3515, + "mean_token_accuracy": 0.36604613065719604, + "num_tokens": 3230876325.0, + "step": 6319 + }, + { + "epoch": 1.7090319091400756, + "grad_norm": 3.0, + "learning_rate": 0.015789385124277286, + "loss": 3.2574, + "mean_token_accuracy": 0.3677574098110199, + "num_tokens": 3231400424.0, + "step": 6320 + }, + { + "epoch": 1.7093023255813953, + "grad_norm": 2.578125, + "learning_rate": 0.015788050319945745, + "loss": 3.3494, + "mean_token_accuracy": 0.3896673321723938, + "num_tokens": 3231924646.0, + "step": 6321 + }, + { + "epoch": 1.709572742022715, + "grad_norm": 3.09375, + "learning_rate": 0.015786715368702745, + "loss": 3.3324, + "mean_token_accuracy": 0.38772356510162354, + "num_tokens": 3232448879.0, + "step": 6322 + }, + { + "epoch": 1.7098431584640346, + "grad_norm": 2.953125, + "learning_rate": 0.015785380270589237, + "loss": 3.3678, + "mean_token_accuracy": 0.37230193614959717, + "num_tokens": 3232972972.0, + "step": 6323 + }, + { + "epoch": 1.7101135749053542, + "grad_norm": 2.546875, + "learning_rate": 0.01578404502564619, + "loss": 3.1646, + "mean_token_accuracy": 0.3930407762527466, + "num_tokens": 3233497119.0, + "step": 6324 + }, + { + "epoch": 1.7103839913466738, + "grad_norm": 3.25, + "learning_rate": 0.015782709633914573, + "loss": 3.2925, + "mean_token_accuracy": 0.3823741674423218, + "num_tokens": 3234021288.0, + "step": 6325 + }, + { + "epoch": 1.7106544077879935, + "grad_norm": 2.78125, + "learning_rate": 0.01578137409543536, + "loss": 3.2506, + "mean_token_accuracy": 0.3807101547718048, + "num_tokens": 3234493159.0, + "step": 6326 + }, + { + "epoch": 1.7109248242293131, + "grad_norm": 3.109375, + "learning_rate": 0.01578003841024953, + "loss": 2.9414, + "mean_token_accuracy": 0.37697499990463257, + "num_tokens": 3234972381.0, + "step": 6327 + }, + { + "epoch": 1.7111952406706328, + "grad_norm": 2.25, + "learning_rate": 0.015778702578398064, + "loss": 3.2852, + "mean_token_accuracy": 0.3852176070213318, + "num_tokens": 3235496603.0, + "step": 6328 + }, + { + "epoch": 1.7114656571119524, + "grad_norm": 3.875, + "learning_rate": 0.015777366599921946, + "loss": 3.3783, + "mean_token_accuracy": 0.3658374845981598, + "num_tokens": 3236020772.0, + "step": 6329 + }, + { + "epoch": 1.711736073553272, + "grad_norm": 2.734375, + "learning_rate": 0.015776030474862174, + "loss": 3.2717, + "mean_token_accuracy": 0.4173696041107178, + "num_tokens": 3236489753.0, + "step": 6330 + }, + { + "epoch": 1.7120064899945917, + "grad_norm": 48.75, + "learning_rate": 0.01577469420325974, + "loss": 11.3144, + "mean_token_accuracy": 0.012767154723405838, + "num_tokens": 3237013993.0, + "step": 6331 + }, + { + "epoch": 1.7122769064359114, + "grad_norm": 6.28125, + "learning_rate": 0.015773357785155647, + "loss": 3.7639, + "mean_token_accuracy": 0.3288377523422241, + "num_tokens": 3237538211.0, + "step": 6332 + }, + { + "epoch": 1.712547322877231, + "grad_norm": 2.171875, + "learning_rate": 0.0157720212205909, + "loss": 3.5063, + "mean_token_accuracy": 0.36658984422683716, + "num_tokens": 3238062480.0, + "step": 6333 + }, + { + "epoch": 1.7128177393185506, + "grad_norm": 2.78125, + "learning_rate": 0.01577068450960651, + "loss": 3.3338, + "mean_token_accuracy": 0.3982802629470825, + "num_tokens": 3238586725.0, + "step": 6334 + }, + { + "epoch": 1.7130881557598703, + "grad_norm": 3.265625, + "learning_rate": 0.015769347652243488, + "loss": 3.4, + "mean_token_accuracy": 0.3854183554649353, + "num_tokens": 3239081600.0, + "step": 6335 + }, + { + "epoch": 1.71335857220119, + "grad_norm": 3.046875, + "learning_rate": 0.015768010648542854, + "loss": 3.2921, + "mean_token_accuracy": 0.4091447591781616, + "num_tokens": 3239542029.0, + "step": 6336 + }, + { + "epoch": 1.7136289886425096, + "grad_norm": 3.390625, + "learning_rate": 0.015766673498545634, + "loss": 3.3117, + "mean_token_accuracy": 0.3955352306365967, + "num_tokens": 3240008585.0, + "step": 6337 + }, + { + "epoch": 1.7138994050838292, + "grad_norm": 2.59375, + "learning_rate": 0.01576533620229285, + "loss": 3.2868, + "mean_token_accuracy": 0.3855976462364197, + "num_tokens": 3240501507.0, + "step": 6338 + }, + { + "epoch": 1.7141698215251489, + "grad_norm": 3.296875, + "learning_rate": 0.015763998759825537, + "loss": 3.3713, + "mean_token_accuracy": 0.37214386463165283, + "num_tokens": 3241025740.0, + "step": 6339 + }, + { + "epoch": 1.7144402379664685, + "grad_norm": 3.0, + "learning_rate": 0.015762661171184737, + "loss": 3.2821, + "mean_token_accuracy": 0.37743625044822693, + "num_tokens": 3241549910.0, + "step": 6340 + }, + { + "epoch": 1.7147106544077881, + "grad_norm": 3.453125, + "learning_rate": 0.01576132343641148, + "loss": 3.2809, + "mean_token_accuracy": 0.37289851903915405, + "num_tokens": 3242074181.0, + "step": 6341 + }, + { + "epoch": 1.7149810708491078, + "grad_norm": 2.765625, + "learning_rate": 0.01575998555554682, + "loss": 3.1936, + "mean_token_accuracy": 0.39262527227401733, + "num_tokens": 3242598324.0, + "step": 6342 + }, + { + "epoch": 1.7152514872904274, + "grad_norm": 3.40625, + "learning_rate": 0.015758647528631806, + "loss": 3.225, + "mean_token_accuracy": 0.37975409626960754, + "num_tokens": 3243098589.0, + "step": 6343 + }, + { + "epoch": 1.7155219037317468, + "grad_norm": 2.125, + "learning_rate": 0.015757309355707492, + "loss": 3.1947, + "mean_token_accuracy": 0.39318686723709106, + "num_tokens": 3243622870.0, + "step": 6344 + }, + { + "epoch": 1.7157923201730665, + "grad_norm": 3.171875, + "learning_rate": 0.015755971036814937, + "loss": 3.1837, + "mean_token_accuracy": 0.4096319377422333, + "num_tokens": 3244147064.0, + "step": 6345 + }, + { + "epoch": 1.7160627366143861, + "grad_norm": 2.609375, + "learning_rate": 0.015754632571995202, + "loss": 3.3476, + "mean_token_accuracy": 0.40567195415496826, + "num_tokens": 3244611224.0, + "step": 6346 + }, + { + "epoch": 1.7163331530557058, + "grad_norm": 3.34375, + "learning_rate": 0.015753293961289355, + "loss": 3.4274, + "mean_token_accuracy": 0.36515501141548157, + "num_tokens": 3245135424.0, + "step": 6347 + }, + { + "epoch": 1.7166035694970254, + "grad_norm": 2.34375, + "learning_rate": 0.015751955204738478, + "loss": 3.139, + "mean_token_accuracy": 0.3888853192329407, + "num_tokens": 3245659605.0, + "step": 6348 + }, + { + "epoch": 1.716873985938345, + "grad_norm": 2.953125, + "learning_rate": 0.015750616302383634, + "loss": 3.1109, + "mean_token_accuracy": 0.38800951838493347, + "num_tokens": 3246183796.0, + "step": 6349 + }, + { + "epoch": 1.7171444023796647, + "grad_norm": 3.25, + "learning_rate": 0.015749277254265913, + "loss": 3.3878, + "mean_token_accuracy": 0.3835670053958893, + "num_tokens": 3246652175.0, + "step": 6350 + }, + { + "epoch": 1.7174148188209843, + "grad_norm": 51.5, + "learning_rate": 0.015747938060426403, + "loss": 12.7683, + "mean_token_accuracy": 0.009286642074584961, + "num_tokens": 3247176402.0, + "step": 6351 + }, + { + "epoch": 1.717685235262304, + "grad_norm": 7.125, + "learning_rate": 0.01574659872090619, + "loss": 3.8777, + "mean_token_accuracy": 0.3106288015842438, + "num_tokens": 3247700675.0, + "step": 6352 + }, + { + "epoch": 1.7179556517036236, + "grad_norm": 2.46875, + "learning_rate": 0.015745259235746364, + "loss": 3.2863, + "mean_token_accuracy": 0.3808540105819702, + "num_tokens": 3248224886.0, + "step": 6353 + }, + { + "epoch": 1.718226068144943, + "grad_norm": 2.703125, + "learning_rate": 0.015743919604988034, + "loss": 3.0385, + "mean_token_accuracy": 0.40412014722824097, + "num_tokens": 3248749069.0, + "step": 6354 + }, + { + "epoch": 1.7184964845862627, + "grad_norm": 3.8125, + "learning_rate": 0.015742579828672297, + "loss": 3.2586, + "mean_token_accuracy": 0.40718406438827515, + "num_tokens": 3249273284.0, + "step": 6355 + }, + { + "epoch": 1.7187669010275823, + "grad_norm": 3.359375, + "learning_rate": 0.015741239906840267, + "loss": 3.3025, + "mean_token_accuracy": 0.3692382276058197, + "num_tokens": 3249797285.0, + "step": 6356 + }, + { + "epoch": 1.719037317468902, + "grad_norm": 3.046875, + "learning_rate": 0.01573989983953305, + "loss": 3.0614, + "mean_token_accuracy": 0.39644843339920044, + "num_tokens": 3250321407.0, + "step": 6357 + }, + { + "epoch": 1.7193077339102216, + "grad_norm": 3.703125, + "learning_rate": 0.015738559626791767, + "loss": 3.2821, + "mean_token_accuracy": 0.3793869614601135, + "num_tokens": 3250845602.0, + "step": 6358 + }, + { + "epoch": 1.7195781503515413, + "grad_norm": 2.40625, + "learning_rate": 0.015737219268657544, + "loss": 3.3248, + "mean_token_accuracy": 0.3697507381439209, + "num_tokens": 3251369728.0, + "step": 6359 + }, + { + "epoch": 1.719848566792861, + "grad_norm": 3.359375, + "learning_rate": 0.015735878765171497, + "loss": 3.2714, + "mean_token_accuracy": 0.35697805881500244, + "num_tokens": 3251894005.0, + "step": 6360 + }, + { + "epoch": 1.7201189832341806, + "grad_norm": 3.09375, + "learning_rate": 0.015734538116374767, + "loss": 3.4661, + "mean_token_accuracy": 0.4005248546600342, + "num_tokens": 3252354680.0, + "step": 6361 + }, + { + "epoch": 1.7203893996755002, + "grad_norm": 3.21875, + "learning_rate": 0.015733197322308483, + "loss": 3.3201, + "mean_token_accuracy": 0.3902454972267151, + "num_tokens": 3252866154.0, + "step": 6362 + }, + { + "epoch": 1.7206598161168198, + "grad_norm": 2.359375, + "learning_rate": 0.015731856383013783, + "loss": 3.3177, + "mean_token_accuracy": 0.39846712350845337, + "num_tokens": 3253338927.0, + "step": 6363 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 2.96875, + "learning_rate": 0.015730515298531814, + "loss": 3.4271, + "mean_token_accuracy": 0.3597615957260132, + "num_tokens": 3253863113.0, + "step": 6364 + }, + { + "epoch": 1.7212006489994591, + "grad_norm": 2.8125, + "learning_rate": 0.015729174068903725, + "loss": 3.321, + "mean_token_accuracy": 0.3886452615261078, + "num_tokens": 3254374161.0, + "step": 6365 + }, + { + "epoch": 1.7214710654407788, + "grad_norm": 2.6875, + "learning_rate": 0.01572783269417067, + "loss": 3.3729, + "mean_token_accuracy": 0.37731504440307617, + "num_tokens": 3254898344.0, + "step": 6366 + }, + { + "epoch": 1.7217414818820984, + "grad_norm": 3.671875, + "learning_rate": 0.0157264911743738, + "loss": 3.4301, + "mean_token_accuracy": 0.3758620619773865, + "num_tokens": 3255422618.0, + "step": 6367 + }, + { + "epoch": 1.722011898323418, + "grad_norm": 3.21875, + "learning_rate": 0.015725149509554285, + "loss": 3.2534, + "mean_token_accuracy": 0.38737595081329346, + "num_tokens": 3255946823.0, + "step": 6368 + }, + { + "epoch": 1.7222823147647377, + "grad_norm": 3.390625, + "learning_rate": 0.015723807699753286, + "loss": 3.285, + "mean_token_accuracy": 0.3819606304168701, + "num_tokens": 3256426584.0, + "step": 6369 + }, + { + "epoch": 1.7225527312060573, + "grad_norm": 2.546875, + "learning_rate": 0.015722465745011974, + "loss": 3.3751, + "mean_token_accuracy": 0.3706364631652832, + "num_tokens": 3256929113.0, + "step": 6370 + }, + { + "epoch": 1.722823147647377, + "grad_norm": 114.0, + "learning_rate": 0.015721123645371527, + "loss": 25.4518, + "mean_token_accuracy": 1.6791143934824504e-05, + "num_tokens": 3257453289.0, + "step": 6371 + }, + { + "epoch": 1.7230935640886966, + "grad_norm": 6.25, + "learning_rate": 0.01571978140087312, + "loss": 3.6394, + "mean_token_accuracy": 0.31847620010375977, + "num_tokens": 3257977556.0, + "step": 6372 + }, + { + "epoch": 1.7233639805300163, + "grad_norm": 1.96875, + "learning_rate": 0.015718439011557943, + "loss": 3.3995, + "mean_token_accuracy": 0.3772510886192322, + "num_tokens": 3258463459.0, + "step": 6373 + }, + { + "epoch": 1.723634396971336, + "grad_norm": 2.234375, + "learning_rate": 0.01571709647746718, + "loss": 3.1971, + "mean_token_accuracy": 0.38895177841186523, + "num_tokens": 3258987643.0, + "step": 6374 + }, + { + "epoch": 1.7239048134126556, + "grad_norm": 3.078125, + "learning_rate": 0.015715753798642023, + "loss": 3.0951, + "mean_token_accuracy": 0.35989969968795776, + "num_tokens": 3259511888.0, + "step": 6375 + }, + { + "epoch": 1.7241752298539752, + "grad_norm": 2.453125, + "learning_rate": 0.01571441097512367, + "loss": 3.0616, + "mean_token_accuracy": 0.4248882830142975, + "num_tokens": 3260036149.0, + "step": 6376 + }, + { + "epoch": 1.7244456462952948, + "grad_norm": 2.921875, + "learning_rate": 0.01571306800695333, + "loss": 3.3494, + "mean_token_accuracy": 0.39082595705986023, + "num_tokens": 3260560383.0, + "step": 6377 + }, + { + "epoch": 1.7247160627366145, + "grad_norm": 3.25, + "learning_rate": 0.015711724894172198, + "loss": 3.2951, + "mean_token_accuracy": 0.3773704171180725, + "num_tokens": 3261084656.0, + "step": 6378 + }, + { + "epoch": 1.7249864791779341, + "grad_norm": 3.375, + "learning_rate": 0.015710381636821494, + "loss": 3.2779, + "mean_token_accuracy": 0.39567506313323975, + "num_tokens": 3261608934.0, + "step": 6379 + }, + { + "epoch": 1.7252568956192538, + "grad_norm": 3.078125, + "learning_rate": 0.015709038234942428, + "loss": 3.3603, + "mean_token_accuracy": 0.36717069149017334, + "num_tokens": 3262133057.0, + "step": 6380 + }, + { + "epoch": 1.7255273120605734, + "grad_norm": 2.8125, + "learning_rate": 0.01570769468857622, + "loss": 3.135, + "mean_token_accuracy": 0.40759047865867615, + "num_tokens": 3262612063.0, + "step": 6381 + }, + { + "epoch": 1.725797728501893, + "grad_norm": 2.890625, + "learning_rate": 0.01570635099776409, + "loss": 3.26, + "mean_token_accuracy": 0.3954768776893616, + "num_tokens": 3263136227.0, + "step": 6382 + }, + { + "epoch": 1.7260681449432127, + "grad_norm": 3.640625, + "learning_rate": 0.015705007162547278, + "loss": 3.1959, + "mean_token_accuracy": 0.39214128255844116, + "num_tokens": 3263603359.0, + "step": 6383 + }, + { + "epoch": 1.7263385613845323, + "grad_norm": 2.828125, + "learning_rate": 0.015703663182967006, + "loss": 3.4495, + "mean_token_accuracy": 0.38413965702056885, + "num_tokens": 3264127565.0, + "step": 6384 + }, + { + "epoch": 1.7266089778258518, + "grad_norm": 3.5, + "learning_rate": 0.015702319059064516, + "loss": 3.1447, + "mean_token_accuracy": 0.37787431478500366, + "num_tokens": 3264602400.0, + "step": 6385 + }, + { + "epoch": 1.7268793942671714, + "grad_norm": 2.234375, + "learning_rate": 0.01570097479088105, + "loss": 3.2864, + "mean_token_accuracy": 0.41689348220825195, + "num_tokens": 3265053135.0, + "step": 6386 + }, + { + "epoch": 1.727149810708491, + "grad_norm": 3.9375, + "learning_rate": 0.01569963037845785, + "loss": 3.3272, + "mean_token_accuracy": 0.3825835585594177, + "num_tokens": 3265556146.0, + "step": 6387 + }, + { + "epoch": 1.7274202271498107, + "grad_norm": 2.28125, + "learning_rate": 0.015698285821836175, + "loss": 3.0415, + "mean_token_accuracy": 0.4085060954093933, + "num_tokens": 3266080396.0, + "step": 6388 + }, + { + "epoch": 1.7276906435911303, + "grad_norm": 4.0625, + "learning_rate": 0.015696941121057267, + "loss": 3.2678, + "mean_token_accuracy": 0.380053848028183, + "num_tokens": 3266586447.0, + "step": 6389 + }, + { + "epoch": 1.72796106003245, + "grad_norm": 3.1875, + "learning_rate": 0.01569559627616239, + "loss": 3.3752, + "mean_token_accuracy": 0.387983500957489, + "num_tokens": 3267110687.0, + "step": 6390 + }, + { + "epoch": 1.7282314764737696, + "grad_norm": 1.5, + "learning_rate": 0.01569425128719282, + "loss": 11.1138, + "mean_token_accuracy": 1.5228656593535561e-05, + "num_tokens": 3267628779.0, + "step": 6391 + }, + { + "epoch": 1.7285018929150893, + "grad_norm": 6.40625, + "learning_rate": 0.01569290615418981, + "loss": 3.8185, + "mean_token_accuracy": 0.322762131690979, + "num_tokens": 3268153054.0, + "step": 6392 + }, + { + "epoch": 1.728772309356409, + "grad_norm": 2.078125, + "learning_rate": 0.01569156087719464, + "loss": 3.3804, + "mean_token_accuracy": 0.4100843369960785, + "num_tokens": 3268613855.0, + "step": 6393 + }, + { + "epoch": 1.7290427257977286, + "grad_norm": 2.953125, + "learning_rate": 0.015690215456248588, + "loss": 3.2538, + "mean_token_accuracy": 0.38793307542800903, + "num_tokens": 3269138107.0, + "step": 6394 + }, + { + "epoch": 1.729313142239048, + "grad_norm": 4.53125, + "learning_rate": 0.01568886989139293, + "loss": 3.449, + "mean_token_accuracy": 0.3497084379196167, + "num_tokens": 3269627720.0, + "step": 6395 + }, + { + "epoch": 1.7295835586803676, + "grad_norm": 4.875, + "learning_rate": 0.015687524182668954, + "loss": 2.8183, + "mean_token_accuracy": 0.4454677700996399, + "num_tokens": 3270073923.0, + "step": 6396 + }, + { + "epoch": 1.7298539751216873, + "grad_norm": 1.8671875, + "learning_rate": 0.01568617833011795, + "loss": 3.1543, + "mean_token_accuracy": 0.4011945128440857, + "num_tokens": 3270598087.0, + "step": 6397 + }, + { + "epoch": 1.730124391563007, + "grad_norm": 3.21875, + "learning_rate": 0.015684832333781217, + "loss": 3.6299, + "mean_token_accuracy": 0.33362877368927, + "num_tokens": 3271122211.0, + "step": 6398 + }, + { + "epoch": 1.7303948080043265, + "grad_norm": 2.75, + "learning_rate": 0.015683486193700046, + "loss": 3.2439, + "mean_token_accuracy": 0.4039151072502136, + "num_tokens": 3271611635.0, + "step": 6399 + }, + { + "epoch": 1.7306652244456462, + "grad_norm": 2.78125, + "learning_rate": 0.015682139909915748, + "loss": 3.5141, + "mean_token_accuracy": 0.3554721176624298, + "num_tokens": 3272135914.0, + "step": 6400 + }, + { + "epoch": 1.7309356408869658, + "grad_norm": 15.5, + "learning_rate": 0.015680793482469632, + "loss": 3.2107, + "mean_token_accuracy": 0.42152175307273865, + "num_tokens": 3272660041.0, + "step": 6401 + }, + { + "epoch": 1.7312060573282855, + "grad_norm": 3.40625, + "learning_rate": 0.015679446911403005, + "loss": 3.5619, + "mean_token_accuracy": 0.3565114438533783, + "num_tokens": 3273184291.0, + "step": 6402 + }, + { + "epoch": 1.7314764737696051, + "grad_norm": 2.5, + "learning_rate": 0.01567810019675718, + "loss": 3.2833, + "mean_token_accuracy": 0.38799232244491577, + "num_tokens": 3273670610.0, + "step": 6403 + }, + { + "epoch": 1.7317468902109248, + "grad_norm": 3.375, + "learning_rate": 0.015676753338573487, + "loss": 3.1229, + "mean_token_accuracy": 0.39409586787223816, + "num_tokens": 3274194762.0, + "step": 6404 + }, + { + "epoch": 1.7320173066522444, + "grad_norm": 2.28125, + "learning_rate": 0.01567540633689325, + "loss": 3.3201, + "mean_token_accuracy": 0.3748871684074402, + "num_tokens": 3274718992.0, + "step": 6405 + }, + { + "epoch": 1.732287723093564, + "grad_norm": 3.59375, + "learning_rate": 0.01567405919175779, + "loss": 3.4767, + "mean_token_accuracy": 0.3622976541519165, + "num_tokens": 3275243214.0, + "step": 6406 + }, + { + "epoch": 1.7325581395348837, + "grad_norm": 2.96875, + "learning_rate": 0.015672711903208455, + "loss": 3.2594, + "mean_token_accuracy": 0.3874320089817047, + "num_tokens": 3275735558.0, + "step": 6407 + }, + { + "epoch": 1.7328285559762033, + "grad_norm": 3.328125, + "learning_rate": 0.01567136447128658, + "loss": 3.4738, + "mean_token_accuracy": 0.38840657472610474, + "num_tokens": 3276198655.0, + "step": 6408 + }, + { + "epoch": 1.733098972417523, + "grad_norm": 2.59375, + "learning_rate": 0.0156700168960335, + "loss": 3.2035, + "mean_token_accuracy": 0.40032264590263367, + "num_tokens": 3276722799.0, + "step": 6409 + }, + { + "epoch": 1.7333693888588426, + "grad_norm": 2.96875, + "learning_rate": 0.015668669177490567, + "loss": 3.3069, + "mean_token_accuracy": 0.3934454917907715, + "num_tokens": 3277213117.0, + "step": 6410 + }, + { + "epoch": 1.7336398053001623, + "grad_norm": 22.375, + "learning_rate": 0.01566732131569914, + "loss": 11.8124, + "mean_token_accuracy": 9.693173342384398e-05, + "num_tokens": 3277720150.0, + "step": 6411 + }, + { + "epoch": 1.733910221741482, + "grad_norm": 6.4375, + "learning_rate": 0.015665973310700562, + "loss": 3.7389, + "mean_token_accuracy": 0.32063043117523193, + "num_tokens": 3278244337.0, + "step": 6412 + }, + { + "epoch": 1.7341806381828015, + "grad_norm": 2.5, + "learning_rate": 0.015664625162536205, + "loss": 3.1681, + "mean_token_accuracy": 0.34145689010620117, + "num_tokens": 3278768507.0, + "step": 6413 + }, + { + "epoch": 1.7344510546241212, + "grad_norm": 3.359375, + "learning_rate": 0.015663276871247432, + "loss": 3.0138, + "mean_token_accuracy": 0.39270633459091187, + "num_tokens": 3279292656.0, + "step": 6414 + }, + { + "epoch": 1.7347214710654408, + "grad_norm": 3.71875, + "learning_rate": 0.01566192843687561, + "loss": 3.408, + "mean_token_accuracy": 0.38929176330566406, + "num_tokens": 3279730527.0, + "step": 6415 + }, + { + "epoch": 1.7349918875067605, + "grad_norm": 2.765625, + "learning_rate": 0.015660579859462113, + "loss": 3.1178, + "mean_token_accuracy": 0.3903507888317108, + "num_tokens": 3280254720.0, + "step": 6416 + }, + { + "epoch": 1.7352623039480801, + "grad_norm": 3.265625, + "learning_rate": 0.015659231139048317, + "loss": 3.2747, + "mean_token_accuracy": 0.39543604850769043, + "num_tokens": 3280747321.0, + "step": 6417 + }, + { + "epoch": 1.7355327203893998, + "grad_norm": 2.890625, + "learning_rate": 0.015657882275675612, + "loss": 3.3042, + "mean_token_accuracy": 0.39650028944015503, + "num_tokens": 3281169363.0, + "step": 6418 + }, + { + "epoch": 1.7358031368307194, + "grad_norm": 2.671875, + "learning_rate": 0.01565653326938538, + "loss": 3.3826, + "mean_token_accuracy": 0.3793582618236542, + "num_tokens": 3281693497.0, + "step": 6419 + }, + { + "epoch": 1.736073553272039, + "grad_norm": 2.953125, + "learning_rate": 0.01565518412021901, + "loss": 3.3798, + "mean_token_accuracy": 0.3617047667503357, + "num_tokens": 3282217760.0, + "step": 6420 + }, + { + "epoch": 1.7363439697133587, + "grad_norm": 2.84375, + "learning_rate": 0.015653834828217906, + "loss": 3.4068, + "mean_token_accuracy": 0.3776581585407257, + "num_tokens": 3282732144.0, + "step": 6421 + }, + { + "epoch": 1.7366143861546783, + "grad_norm": 2.5625, + "learning_rate": 0.015652485393423458, + "loss": 3.1381, + "mean_token_accuracy": 0.3694031238555908, + "num_tokens": 3283256276.0, + "step": 6422 + }, + { + "epoch": 1.736884802595998, + "grad_norm": 3.015625, + "learning_rate": 0.01565113581587708, + "loss": 3.3359, + "mean_token_accuracy": 0.38060930371284485, + "num_tokens": 3283780560.0, + "step": 6423 + }, + { + "epoch": 1.7371552190373176, + "grad_norm": 3.609375, + "learning_rate": 0.01564978609562018, + "loss": 3.1342, + "mean_token_accuracy": 0.3952587842941284, + "num_tokens": 3284240578.0, + "step": 6424 + }, + { + "epoch": 1.7374256354786373, + "grad_norm": 2.765625, + "learning_rate": 0.015648436232694164, + "loss": 3.0967, + "mean_token_accuracy": 0.38669461011886597, + "num_tokens": 3284764830.0, + "step": 6425 + }, + { + "epoch": 1.7376960519199567, + "grad_norm": 2.890625, + "learning_rate": 0.015647086227140453, + "loss": 3.2593, + "mean_token_accuracy": 0.37457510828971863, + "num_tokens": 3285288934.0, + "step": 6426 + }, + { + "epoch": 1.7379664683612763, + "grad_norm": 3.1875, + "learning_rate": 0.015645736079000473, + "loss": 3.2192, + "mean_token_accuracy": 0.3878045082092285, + "num_tokens": 3285762598.0, + "step": 6427 + }, + { + "epoch": 1.738236884802596, + "grad_norm": 2.65625, + "learning_rate": 0.01564438578831565, + "loss": 3.5639, + "mean_token_accuracy": 0.3589962124824524, + "num_tokens": 3286286840.0, + "step": 6428 + }, + { + "epoch": 1.7385073012439156, + "grad_norm": 3.390625, + "learning_rate": 0.01564303535512741, + "loss": 3.4817, + "mean_token_accuracy": 0.35940027236938477, + "num_tokens": 3286784492.0, + "step": 6429 + }, + { + "epoch": 1.7387777176852353, + "grad_norm": 2.640625, + "learning_rate": 0.01564168477947719, + "loss": 3.0378, + "mean_token_accuracy": 0.38497811555862427, + "num_tokens": 3287308618.0, + "step": 6430 + }, + { + "epoch": 1.739048134126555, + "grad_norm": 48.75, + "learning_rate": 0.01564033406140643, + "loss": 12.2929, + "mean_token_accuracy": 0.01212233304977417, + "num_tokens": 3287832787.0, + "step": 6431 + }, + { + "epoch": 1.7393185505678745, + "grad_norm": 8.125, + "learning_rate": 0.015638983200956576, + "loss": 3.9039, + "mean_token_accuracy": 0.3076935410499573, + "num_tokens": 3288357009.0, + "step": 6432 + }, + { + "epoch": 1.7395889670091942, + "grad_norm": 2.890625, + "learning_rate": 0.015637632198169077, + "loss": 3.3066, + "mean_token_accuracy": 0.40087437629699707, + "num_tokens": 3288881173.0, + "step": 6433 + }, + { + "epoch": 1.7398593834505138, + "grad_norm": 2.75, + "learning_rate": 0.01563628105308538, + "loss": 3.2799, + "mean_token_accuracy": 0.38334083557128906, + "num_tokens": 3289342212.0, + "step": 6434 + }, + { + "epoch": 1.7401297998918335, + "grad_norm": 3.859375, + "learning_rate": 0.01563492976574695, + "loss": 3.2265, + "mean_token_accuracy": 0.4010303020477295, + "num_tokens": 3289743035.0, + "step": 6435 + }, + { + "epoch": 1.740400216333153, + "grad_norm": 2.59375, + "learning_rate": 0.01563357833619524, + "loss": 3.254, + "mean_token_accuracy": 0.380500853061676, + "num_tokens": 3290267241.0, + "step": 6436 + }, + { + "epoch": 1.7406706327744725, + "grad_norm": 2.5625, + "learning_rate": 0.015632226764471728, + "loss": 3.5054, + "mean_token_accuracy": 0.3625847399234772, + "num_tokens": 3290785401.0, + "step": 6437 + }, + { + "epoch": 1.7409410492157922, + "grad_norm": 2.859375, + "learning_rate": 0.01563087505061787, + "loss": 3.2787, + "mean_token_accuracy": 0.38135868310928345, + "num_tokens": 3291309638.0, + "step": 6438 + }, + { + "epoch": 1.7412114656571118, + "grad_norm": 2.484375, + "learning_rate": 0.015629523194675147, + "loss": 3.1856, + "mean_token_accuracy": 0.4061853885650635, + "num_tokens": 3291833884.0, + "step": 6439 + }, + { + "epoch": 1.7414818820984315, + "grad_norm": 3.125, + "learning_rate": 0.015628171196685036, + "loss": 3.1211, + "mean_token_accuracy": 0.3927691578865051, + "num_tokens": 3292358157.0, + "step": 6440 + }, + { + "epoch": 1.741752298539751, + "grad_norm": 2.453125, + "learning_rate": 0.015626819056689025, + "loss": 3.2329, + "mean_token_accuracy": 0.39577674865722656, + "num_tokens": 3292882423.0, + "step": 6441 + }, + { + "epoch": 1.7420227149810708, + "grad_norm": 3.140625, + "learning_rate": 0.015625466774728602, + "loss": 3.2598, + "mean_token_accuracy": 0.37118446826934814, + "num_tokens": 3293406561.0, + "step": 6442 + }, + { + "epoch": 1.7422931314223904, + "grad_norm": 2.75, + "learning_rate": 0.015624114350845252, + "loss": 3.3191, + "mean_token_accuracy": 0.3888397514820099, + "num_tokens": 3293868785.0, + "step": 6443 + }, + { + "epoch": 1.74256354786371, + "grad_norm": 2.84375, + "learning_rate": 0.015622761785080477, + "loss": 3.2973, + "mean_token_accuracy": 0.37178468704223633, + "num_tokens": 3294392959.0, + "step": 6444 + }, + { + "epoch": 1.7428339643050297, + "grad_norm": 2.296875, + "learning_rate": 0.015621409077475772, + "loss": 3.185, + "mean_token_accuracy": 0.36420243978500366, + "num_tokens": 3294917216.0, + "step": 6445 + }, + { + "epoch": 1.7431043807463493, + "grad_norm": 2.984375, + "learning_rate": 0.01562005622807265, + "loss": 3.3268, + "mean_token_accuracy": 0.3623445928096771, + "num_tokens": 3295441490.0, + "step": 6446 + }, + { + "epoch": 1.743374797187669, + "grad_norm": 2.421875, + "learning_rate": 0.015618703236912612, + "loss": 3.2589, + "mean_token_accuracy": 0.3825947642326355, + "num_tokens": 3295965608.0, + "step": 6447 + }, + { + "epoch": 1.7436452136289886, + "grad_norm": 2.890625, + "learning_rate": 0.015617350104037175, + "loss": 3.3413, + "mean_token_accuracy": 0.39122357964515686, + "num_tokens": 3296471451.0, + "step": 6448 + }, + { + "epoch": 1.7439156300703083, + "grad_norm": 2.859375, + "learning_rate": 0.015615996829487864, + "loss": 3.1987, + "mean_token_accuracy": 0.3883569836616516, + "num_tokens": 3296964378.0, + "step": 6449 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 2.734375, + "learning_rate": 0.015614643413306194, + "loss": 3.2298, + "mean_token_accuracy": 0.40765902400016785, + "num_tokens": 3297488565.0, + "step": 6450 + }, + { + "epoch": 1.7444564629529475, + "grad_norm": 54.75, + "learning_rate": 0.015613289855533691, + "loss": 14.7681, + "mean_token_accuracy": 0.0001816757139749825, + "num_tokens": 3298012820.0, + "step": 6451 + }, + { + "epoch": 1.7447268793942672, + "grad_norm": 34.0, + "learning_rate": 0.01561193615621189, + "loss": 3.8577, + "mean_token_accuracy": 0.3253364562988281, + "num_tokens": 3298536998.0, + "step": 6452 + }, + { + "epoch": 1.7449972958355868, + "grad_norm": 4.65625, + "learning_rate": 0.015610582315382322, + "loss": 3.6338, + "mean_token_accuracy": 0.33863362669944763, + "num_tokens": 3299044884.0, + "step": 6453 + }, + { + "epoch": 1.7452677122769065, + "grad_norm": 2.765625, + "learning_rate": 0.015609228333086533, + "loss": 3.4917, + "mean_token_accuracy": 0.35745906829833984, + "num_tokens": 3299569158.0, + "step": 6454 + }, + { + "epoch": 1.7455381287182261, + "grad_norm": 3.53125, + "learning_rate": 0.015607874209366063, + "loss": 3.4662, + "mean_token_accuracy": 0.35614803433418274, + "num_tokens": 3300093328.0, + "step": 6455 + }, + { + "epoch": 1.7458085451595458, + "grad_norm": 2.5, + "learning_rate": 0.015606519944262463, + "loss": 3.4752, + "mean_token_accuracy": 0.37189561128616333, + "num_tokens": 3300617496.0, + "step": 6456 + }, + { + "epoch": 1.7460789616008654, + "grad_norm": 3.875, + "learning_rate": 0.015605165537817283, + "loss": 3.4536, + "mean_token_accuracy": 0.346327543258667, + "num_tokens": 3301141774.0, + "step": 6457 + }, + { + "epoch": 1.746349378042185, + "grad_norm": 2.9375, + "learning_rate": 0.01560381099007208, + "loss": 3.6157, + "mean_token_accuracy": 0.3650937080383301, + "num_tokens": 3301652109.0, + "step": 6458 + }, + { + "epoch": 1.7466197944835047, + "grad_norm": 3.8125, + "learning_rate": 0.015602456301068419, + "loss": 3.236, + "mean_token_accuracy": 0.3941781520843506, + "num_tokens": 3302176235.0, + "step": 6459 + }, + { + "epoch": 1.7468902109248243, + "grad_norm": 3.140625, + "learning_rate": 0.015601101470847863, + "loss": 3.3922, + "mean_token_accuracy": 0.3653133511543274, + "num_tokens": 3302700389.0, + "step": 6460 + }, + { + "epoch": 1.747160627366144, + "grad_norm": 3.859375, + "learning_rate": 0.015599746499451983, + "loss": 3.4552, + "mean_token_accuracy": 0.37630462646484375, + "num_tokens": 3303224640.0, + "step": 6461 + }, + { + "epoch": 1.7474310438074636, + "grad_norm": 3.40625, + "learning_rate": 0.01559839138692235, + "loss": 3.2684, + "mean_token_accuracy": 0.3680267333984375, + "num_tokens": 3303748893.0, + "step": 6462 + }, + { + "epoch": 1.7477014602487833, + "grad_norm": 3.078125, + "learning_rate": 0.015597036133300553, + "loss": 3.3078, + "mean_token_accuracy": 0.37146466970443726, + "num_tokens": 3304273005.0, + "step": 6463 + }, + { + "epoch": 1.747971876690103, + "grad_norm": 2.984375, + "learning_rate": 0.015595680738628161, + "loss": 3.172, + "mean_token_accuracy": 0.4052816927433014, + "num_tokens": 3304797212.0, + "step": 6464 + }, + { + "epoch": 1.7482422931314225, + "grad_norm": 2.9375, + "learning_rate": 0.015594325202946775, + "loss": 3.391, + "mean_token_accuracy": 0.36969897150993347, + "num_tokens": 3305321424.0, + "step": 6465 + }, + { + "epoch": 1.7485127095727422, + "grad_norm": 3.890625, + "learning_rate": 0.015592969526297979, + "loss": 3.2644, + "mean_token_accuracy": 0.37747371196746826, + "num_tokens": 3305845696.0, + "step": 6466 + }, + { + "epoch": 1.7487831260140616, + "grad_norm": 2.78125, + "learning_rate": 0.015591613708723369, + "loss": 3.1751, + "mean_token_accuracy": 0.38387173414230347, + "num_tokens": 3306369974.0, + "step": 6467 + }, + { + "epoch": 1.7490535424553812, + "grad_norm": 3.390625, + "learning_rate": 0.015590257750264546, + "loss": 3.5059, + "mean_token_accuracy": 0.38898423314094543, + "num_tokens": 3306839959.0, + "step": 6468 + }, + { + "epoch": 1.749323958896701, + "grad_norm": 2.90625, + "learning_rate": 0.01558890165096312, + "loss": 3.4872, + "mean_token_accuracy": 0.36903077363967896, + "num_tokens": 3307349934.0, + "step": 6469 + }, + { + "epoch": 1.7495943753380205, + "grad_norm": 2.796875, + "learning_rate": 0.015587545410860697, + "loss": 3.3156, + "mean_token_accuracy": 0.4106557369232178, + "num_tokens": 3307811914.0, + "step": 6470 + }, + { + "epoch": 1.7498647917793402, + "grad_norm": 20.875, + "learning_rate": 0.015586189029998887, + "loss": 11.8574, + "mean_token_accuracy": 1.1255063327553216e-05, + "num_tokens": 3308336150.0, + "step": 6471 + }, + { + "epoch": 1.7501352082206598, + "grad_norm": 6.46875, + "learning_rate": 0.015584832508419313, + "loss": 3.6209, + "mean_token_accuracy": 0.3450199365615845, + "num_tokens": 3308800194.0, + "step": 6472 + }, + { + "epoch": 1.7504056246619795, + "grad_norm": 1.7890625, + "learning_rate": 0.015583475846163593, + "loss": 3.4907, + "mean_token_accuracy": 0.3739427328109741, + "num_tokens": 3309324317.0, + "step": 6473 + }, + { + "epoch": 1.750676041103299, + "grad_norm": 2.90625, + "learning_rate": 0.015582119043273352, + "loss": 3.3256, + "mean_token_accuracy": 0.36410319805145264, + "num_tokens": 3309848601.0, + "step": 6474 + }, + { + "epoch": 1.7509464575446188, + "grad_norm": 3.0625, + "learning_rate": 0.015580762099790226, + "loss": 3.2841, + "mean_token_accuracy": 0.4137954115867615, + "num_tokens": 3310307675.0, + "step": 6475 + }, + { + "epoch": 1.7512168739859384, + "grad_norm": 3.328125, + "learning_rate": 0.015579405015755853, + "loss": 3.3721, + "mean_token_accuracy": 0.4050983190536499, + "num_tokens": 3310766833.0, + "step": 6476 + }, + { + "epoch": 1.7514872904272578, + "grad_norm": 2.546875, + "learning_rate": 0.01557804779121186, + "loss": 3.3012, + "mean_token_accuracy": 0.38620030879974365, + "num_tokens": 3311291071.0, + "step": 6477 + }, + { + "epoch": 1.7517577068685775, + "grad_norm": 2.8125, + "learning_rate": 0.015576690426199897, + "loss": 3.0746, + "mean_token_accuracy": 0.3892039656639099, + "num_tokens": 3311815334.0, + "step": 6478 + }, + { + "epoch": 1.752028123309897, + "grad_norm": 2.09375, + "learning_rate": 0.015575332920761615, + "loss": 3.2048, + "mean_token_accuracy": 0.4097934663295746, + "num_tokens": 3312298015.0, + "step": 6479 + }, + { + "epoch": 1.7522985397512167, + "grad_norm": 2.75, + "learning_rate": 0.015573975274938668, + "loss": 3.3446, + "mean_token_accuracy": 0.3766717314720154, + "num_tokens": 3312792023.0, + "step": 6480 + }, + { + "epoch": 1.7525689561925364, + "grad_norm": 2.40625, + "learning_rate": 0.015572617488772702, + "loss": 3.3258, + "mean_token_accuracy": 0.38958489894866943, + "num_tokens": 3313316222.0, + "step": 6481 + }, + { + "epoch": 1.752839372633856, + "grad_norm": 2.4375, + "learning_rate": 0.015571259562305385, + "loss": 3.2181, + "mean_token_accuracy": 0.3783990740776062, + "num_tokens": 3313840314.0, + "step": 6482 + }, + { + "epoch": 1.7531097890751757, + "grad_norm": 3.015625, + "learning_rate": 0.015569901495578384, + "loss": 3.035, + "mean_token_accuracy": 0.39800962805747986, + "num_tokens": 3314329334.0, + "step": 6483 + }, + { + "epoch": 1.7533802055164953, + "grad_norm": 3.09375, + "learning_rate": 0.015568543288633363, + "loss": 3.1148, + "mean_token_accuracy": 0.3785054683685303, + "num_tokens": 3314853523.0, + "step": 6484 + }, + { + "epoch": 1.753650621957815, + "grad_norm": 2.890625, + "learning_rate": 0.015567184941512, + "loss": 3.1456, + "mean_token_accuracy": 0.40664827823638916, + "num_tokens": 3315319045.0, + "step": 6485 + }, + { + "epoch": 1.7539210383991346, + "grad_norm": 3.546875, + "learning_rate": 0.015565826454255976, + "loss": 3.3485, + "mean_token_accuracy": 0.37777143716812134, + "num_tokens": 3315843295.0, + "step": 6486 + }, + { + "epoch": 1.7541914548404542, + "grad_norm": 2.875, + "learning_rate": 0.015564467826906965, + "loss": 3.1178, + "mean_token_accuracy": 0.3915213942527771, + "num_tokens": 3316367490.0, + "step": 6487 + }, + { + "epoch": 1.7544618712817739, + "grad_norm": 3.640625, + "learning_rate": 0.015563109059506659, + "loss": 3.1537, + "mean_token_accuracy": 0.400115430355072, + "num_tokens": 3316891763.0, + "step": 6488 + }, + { + "epoch": 1.7547322877230935, + "grad_norm": 2.625, + "learning_rate": 0.015561750152096746, + "loss": 3.3964, + "mean_token_accuracy": 0.33817625045776367, + "num_tokens": 3317390103.0, + "step": 6489 + }, + { + "epoch": 1.7550027041644132, + "grad_norm": 2.9375, + "learning_rate": 0.015560391104718924, + "loss": 3.4216, + "mean_token_accuracy": 0.3851550817489624, + "num_tokens": 3317914339.0, + "step": 6490 + }, + { + "epoch": 1.7552731206057328, + "grad_norm": 77.0, + "learning_rate": 0.01555903191741489, + "loss": 10.4846, + "mean_token_accuracy": 0.014235038310289383, + "num_tokens": 3318438618.0, + "step": 6491 + }, + { + "epoch": 1.7555435370470525, + "grad_norm": 9.75, + "learning_rate": 0.015557672590226352, + "loss": 3.9136, + "mean_token_accuracy": 0.3127519488334656, + "num_tokens": 3318962885.0, + "step": 6492 + }, + { + "epoch": 1.755813953488372, + "grad_norm": 2.921875, + "learning_rate": 0.015556313123195015, + "loss": 3.6133, + "mean_token_accuracy": 0.3511953353881836, + "num_tokens": 3319487136.0, + "step": 6493 + }, + { + "epoch": 1.7560843699296917, + "grad_norm": 3.203125, + "learning_rate": 0.015554953516362593, + "loss": 3.4217, + "mean_token_accuracy": 0.40524184703826904, + "num_tokens": 3319948882.0, + "step": 6494 + }, + { + "epoch": 1.7563547863710114, + "grad_norm": 3.828125, + "learning_rate": 0.015553593769770802, + "loss": 3.6388, + "mean_token_accuracy": 0.34927892684936523, + "num_tokens": 3320473157.0, + "step": 6495 + }, + { + "epoch": 1.756625202812331, + "grad_norm": 4.03125, + "learning_rate": 0.015552233883461362, + "loss": 3.0404, + "mean_token_accuracy": 0.4108332395553589, + "num_tokens": 3320973221.0, + "step": 6496 + }, + { + "epoch": 1.7568956192536507, + "grad_norm": 2.703125, + "learning_rate": 0.015550873857475998, + "loss": 3.0594, + "mean_token_accuracy": 0.4035925269126892, + "num_tokens": 3321497482.0, + "step": 6497 + }, + { + "epoch": 1.7571660356949703, + "grad_norm": 2.8125, + "learning_rate": 0.015549513691856441, + "loss": 3.4021, + "mean_token_accuracy": 0.3578922152519226, + "num_tokens": 3322021627.0, + "step": 6498 + }, + { + "epoch": 1.75743645213629, + "grad_norm": 2.515625, + "learning_rate": 0.015548153386644429, + "loss": 3.186, + "mean_token_accuracy": 0.3908519744873047, + "num_tokens": 3322545754.0, + "step": 6499 + }, + { + "epoch": 1.7577068685776096, + "grad_norm": 2.96875, + "learning_rate": 0.015546792941881691, + "loss": 3.2704, + "mean_token_accuracy": 0.38549819588661194, + "num_tokens": 3323070019.0, + "step": 6500 + }, + { + "epoch": 1.7579772850189292, + "grad_norm": 3.484375, + "learning_rate": 0.015545432357609976, + "loss": 3.2793, + "mean_token_accuracy": 0.3978022336959839, + "num_tokens": 3323594152.0, + "step": 6501 + }, + { + "epoch": 1.758247701460249, + "grad_norm": 3.140625, + "learning_rate": 0.015544071633871028, + "loss": 3.311, + "mean_token_accuracy": 0.3602563738822937, + "num_tokens": 3324062995.0, + "step": 6502 + }, + { + "epoch": 1.7585181179015685, + "grad_norm": 3.390625, + "learning_rate": 0.0155427107707066, + "loss": 3.4548, + "mean_token_accuracy": 0.3825731873512268, + "num_tokens": 3324556632.0, + "step": 6503 + }, + { + "epoch": 1.7587885343428882, + "grad_norm": 3.234375, + "learning_rate": 0.015541349768158446, + "loss": 3.1543, + "mean_token_accuracy": 0.40329408645629883, + "num_tokens": 3325080745.0, + "step": 6504 + }, + { + "epoch": 1.7590589507842078, + "grad_norm": 2.984375, + "learning_rate": 0.015539988626268326, + "loss": 3.1427, + "mean_token_accuracy": 0.4114963710308075, + "num_tokens": 3325573697.0, + "step": 6505 + }, + { + "epoch": 1.7593293672255275, + "grad_norm": 4.5625, + "learning_rate": 0.015538627345078005, + "loss": 3.1283, + "mean_token_accuracy": 0.39741891622543335, + "num_tokens": 3326081812.0, + "step": 6506 + }, + { + "epoch": 1.759599783666847, + "grad_norm": 1.75, + "learning_rate": 0.015537265924629246, + "loss": 3.0933, + "mean_token_accuracy": 0.39775195717811584, + "num_tokens": 3326605921.0, + "step": 6507 + }, + { + "epoch": 1.7598702001081665, + "grad_norm": 3.125, + "learning_rate": 0.015535904364963831, + "loss": 3.3031, + "mean_token_accuracy": 0.3903128504753113, + "num_tokens": 3327130178.0, + "step": 6508 + }, + { + "epoch": 1.7601406165494862, + "grad_norm": 2.671875, + "learning_rate": 0.015534542666123526, + "loss": 3.2914, + "mean_token_accuracy": 0.3737215995788574, + "num_tokens": 3327654233.0, + "step": 6509 + }, + { + "epoch": 1.7604110329908058, + "grad_norm": 2.8125, + "learning_rate": 0.015533180828150117, + "loss": 3.3886, + "mean_token_accuracy": 0.37873920798301697, + "num_tokens": 3328178469.0, + "step": 6510 + }, + { + "epoch": 1.7606814494321255, + "grad_norm": 2.34375, + "learning_rate": 0.015531818851085392, + "loss": 11.52, + "mean_token_accuracy": 0.0, + "num_tokens": 3328613666.0, + "step": 6511 + }, + { + "epoch": 1.760951865873445, + "grad_norm": 7.375, + "learning_rate": 0.015530456734971138, + "loss": 3.8926, + "mean_token_accuracy": 0.28915244340896606, + "num_tokens": 3329137940.0, + "step": 6512 + }, + { + "epoch": 1.7612222823147647, + "grad_norm": 2.1875, + "learning_rate": 0.015529094479849147, + "loss": 3.4609, + "mean_token_accuracy": 0.34633031487464905, + "num_tokens": 3329662191.0, + "step": 6513 + }, + { + "epoch": 1.7614926987560844, + "grad_norm": 2.09375, + "learning_rate": 0.015527732085761215, + "loss": 3.1243, + "mean_token_accuracy": 0.3736114203929901, + "num_tokens": 3330186213.0, + "step": 6514 + }, + { + "epoch": 1.761763115197404, + "grad_norm": 2.59375, + "learning_rate": 0.015526369552749148, + "loss": 3.314, + "mean_token_accuracy": 0.38966822624206543, + "num_tokens": 3330710358.0, + "step": 6515 + }, + { + "epoch": 1.7620335316387237, + "grad_norm": 3.09375, + "learning_rate": 0.015525006880854758, + "loss": 3.2878, + "mean_token_accuracy": 0.3672642111778259, + "num_tokens": 3331234513.0, + "step": 6516 + }, + { + "epoch": 1.7623039480800433, + "grad_norm": 3.109375, + "learning_rate": 0.015523644070119842, + "loss": 3.182, + "mean_token_accuracy": 0.38979244232177734, + "num_tokens": 3331758752.0, + "step": 6517 + }, + { + "epoch": 1.7625743645213627, + "grad_norm": 5.1875, + "learning_rate": 0.015522281120586226, + "loss": 2.9246, + "mean_token_accuracy": 0.4215211272239685, + "num_tokens": 3332282753.0, + "step": 6518 + }, + { + "epoch": 1.7628447809626824, + "grad_norm": 1.921875, + "learning_rate": 0.015520918032295728, + "loss": 2.98, + "mean_token_accuracy": 0.43985798954963684, + "num_tokens": 3332806902.0, + "step": 6519 + }, + { + "epoch": 1.763115197404002, + "grad_norm": 2.96875, + "learning_rate": 0.015519554805290166, + "loss": 3.4197, + "mean_token_accuracy": 0.36363160610198975, + "num_tokens": 3333331006.0, + "step": 6520 + }, + { + "epoch": 1.7633856138453217, + "grad_norm": 3.4375, + "learning_rate": 0.015518191439611374, + "loss": 3.5041, + "mean_token_accuracy": 0.3689793348312378, + "num_tokens": 3333855174.0, + "step": 6521 + }, + { + "epoch": 1.7636560302866413, + "grad_norm": 2.875, + "learning_rate": 0.015516827935301182, + "loss": 3.1778, + "mean_token_accuracy": 0.3956282138824463, + "num_tokens": 3334374062.0, + "step": 6522 + }, + { + "epoch": 1.763926446727961, + "grad_norm": 3.015625, + "learning_rate": 0.015515464292401424, + "loss": 3.1842, + "mean_token_accuracy": 0.37054213881492615, + "num_tokens": 3334898317.0, + "step": 6523 + }, + { + "epoch": 1.7641968631692806, + "grad_norm": 2.796875, + "learning_rate": 0.015514100510953938, + "loss": 3.3109, + "mean_token_accuracy": 0.39430493116378784, + "num_tokens": 3335422557.0, + "step": 6524 + }, + { + "epoch": 1.7644672796106002, + "grad_norm": 3.140625, + "learning_rate": 0.015512736591000581, + "loss": 3.3377, + "mean_token_accuracy": 0.3931897282600403, + "num_tokens": 3335946790.0, + "step": 6525 + }, + { + "epoch": 1.7647376960519199, + "grad_norm": 3.21875, + "learning_rate": 0.015511372532583195, + "loss": 3.4976, + "mean_token_accuracy": 0.3839870095252991, + "num_tokens": 3336471003.0, + "step": 6526 + }, + { + "epoch": 1.7650081124932395, + "grad_norm": 3.75, + "learning_rate": 0.015510008335743626, + "loss": 3.0277, + "mean_token_accuracy": 0.4313068091869354, + "num_tokens": 3336935712.0, + "step": 6527 + }, + { + "epoch": 1.7652785289345592, + "grad_norm": 2.25, + "learning_rate": 0.015508644000523741, + "loss": 3.2262, + "mean_token_accuracy": 0.3871060311794281, + "num_tokens": 3337459875.0, + "step": 6528 + }, + { + "epoch": 1.7655489453758788, + "grad_norm": 3.65625, + "learning_rate": 0.0155072795269654, + "loss": 3.2783, + "mean_token_accuracy": 0.3746229410171509, + "num_tokens": 3337955223.0, + "step": 6529 + }, + { + "epoch": 1.7658193618171985, + "grad_norm": 2.234375, + "learning_rate": 0.01550591491511047, + "loss": 3.2694, + "mean_token_accuracy": 0.3783368170261383, + "num_tokens": 3338479398.0, + "step": 6530 + }, + { + "epoch": 1.766089778258518, + "grad_norm": 89.0, + "learning_rate": 0.015504550165000814, + "loss": 29.8028, + "mean_token_accuracy": 3.595397720346227e-05, + "num_tokens": 3339003431.0, + "step": 6531 + }, + { + "epoch": 1.7663601946998377, + "grad_norm": 7.40625, + "learning_rate": 0.015503185276678315, + "loss": 3.8079, + "mean_token_accuracy": 0.3301459848880768, + "num_tokens": 3339486285.0, + "step": 6532 + }, + { + "epoch": 1.7666306111411574, + "grad_norm": 2.5, + "learning_rate": 0.015501820250184846, + "loss": 3.4204, + "mean_token_accuracy": 0.37120452523231506, + "num_tokens": 3340010429.0, + "step": 6533 + }, + { + "epoch": 1.766901027582477, + "grad_norm": 2.546875, + "learning_rate": 0.015500455085562297, + "loss": 3.3076, + "mean_token_accuracy": 0.37314194440841675, + "num_tokens": 3340534637.0, + "step": 6534 + }, + { + "epoch": 1.7671714440237967, + "grad_norm": 2.65625, + "learning_rate": 0.01549908978285255, + "loss": 3.3939, + "mean_token_accuracy": 0.36885860562324524, + "num_tokens": 3341058871.0, + "step": 6535 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 2.828125, + "learning_rate": 0.015497724342097495, + "loss": 3.404, + "mean_token_accuracy": 0.3569236397743225, + "num_tokens": 3341583075.0, + "step": 6536 + }, + { + "epoch": 1.767712276906436, + "grad_norm": 2.953125, + "learning_rate": 0.015496358763339034, + "loss": 3.2234, + "mean_token_accuracy": 0.41214776039123535, + "num_tokens": 3342107220.0, + "step": 6537 + }, + { + "epoch": 1.7679826933477556, + "grad_norm": 3.25, + "learning_rate": 0.01549499304661906, + "loss": 3.3704, + "mean_token_accuracy": 0.3690846562385559, + "num_tokens": 3342575991.0, + "step": 6538 + }, + { + "epoch": 1.7682531097890752, + "grad_norm": 3.234375, + "learning_rate": 0.01549362719197948, + "loss": 3.1579, + "mean_token_accuracy": 0.3935595154762268, + "num_tokens": 3343100267.0, + "step": 6539 + }, + { + "epoch": 1.7685235262303949, + "grad_norm": 3.046875, + "learning_rate": 0.015492261199462204, + "loss": 3.0963, + "mean_token_accuracy": 0.3888755440711975, + "num_tokens": 3343624468.0, + "step": 6540 + }, + { + "epoch": 1.7687939426717145, + "grad_norm": 2.84375, + "learning_rate": 0.015490895069109143, + "loss": 3.1033, + "mean_token_accuracy": 0.39859122037887573, + "num_tokens": 3344148746.0, + "step": 6541 + }, + { + "epoch": 1.7690643591130342, + "grad_norm": 4.0, + "learning_rate": 0.015489528800962216, + "loss": 3.2263, + "mean_token_accuracy": 0.38285505771636963, + "num_tokens": 3344672981.0, + "step": 6542 + }, + { + "epoch": 1.7693347755543538, + "grad_norm": 3.828125, + "learning_rate": 0.015488162395063343, + "loss": 3.4208, + "mean_token_accuracy": 0.36210066080093384, + "num_tokens": 3345197105.0, + "step": 6543 + }, + { + "epoch": 1.7696051919956735, + "grad_norm": 2.90625, + "learning_rate": 0.015486795851454448, + "loss": 3.2897, + "mean_token_accuracy": 0.3705163896083832, + "num_tokens": 3345721287.0, + "step": 6544 + }, + { + "epoch": 1.769875608436993, + "grad_norm": 2.5625, + "learning_rate": 0.015485429170177461, + "loss": 3.313, + "mean_token_accuracy": 0.3901502788066864, + "num_tokens": 3346245484.0, + "step": 6545 + }, + { + "epoch": 1.7701460248783127, + "grad_norm": 3.328125, + "learning_rate": 0.015484062351274315, + "loss": 3.3741, + "mean_token_accuracy": 0.3750265836715698, + "num_tokens": 3346769674.0, + "step": 6546 + }, + { + "epoch": 1.7704164413196324, + "grad_norm": 2.640625, + "learning_rate": 0.015482695394786949, + "loss": 3.2345, + "mean_token_accuracy": 0.39821064472198486, + "num_tokens": 3347235656.0, + "step": 6547 + }, + { + "epoch": 1.770686857760952, + "grad_norm": 2.703125, + "learning_rate": 0.015481328300757307, + "loss": 3.3525, + "mean_token_accuracy": 0.39514631032943726, + "num_tokens": 3347751340.0, + "step": 6548 + }, + { + "epoch": 1.7709572742022714, + "grad_norm": 3.1875, + "learning_rate": 0.015479961069227334, + "loss": 3.3866, + "mean_token_accuracy": 0.38218456506729126, + "num_tokens": 3348237148.0, + "step": 6549 + }, + { + "epoch": 1.771227690643591, + "grad_norm": 2.65625, + "learning_rate": 0.015478593700238984, + "loss": 3.2435, + "mean_token_accuracy": 0.400259792804718, + "num_tokens": 3348761158.0, + "step": 6550 + }, + { + "epoch": 1.7714981070849107, + "grad_norm": 5.59375, + "learning_rate": 0.015477226193834204, + "loss": 9.0322, + "mean_token_accuracy": 0.029808664694428444, + "num_tokens": 3349267896.0, + "step": 6551 + }, + { + "epoch": 1.7717685235262304, + "grad_norm": 7.15625, + "learning_rate": 0.015475858550054957, + "loss": 3.8334, + "mean_token_accuracy": 0.30375194549560547, + "num_tokens": 3349792113.0, + "step": 6552 + }, + { + "epoch": 1.77203893996755, + "grad_norm": 2.75, + "learning_rate": 0.01547449076894321, + "loss": 3.2724, + "mean_token_accuracy": 0.3834836781024933, + "num_tokens": 3350316226.0, + "step": 6553 + }, + { + "epoch": 1.7723093564088697, + "grad_norm": 3.375, + "learning_rate": 0.015473122850540928, + "loss": 3.5224, + "mean_token_accuracy": 0.38962802290916443, + "num_tokens": 3350781087.0, + "step": 6554 + }, + { + "epoch": 1.7725797728501893, + "grad_norm": 3.1875, + "learning_rate": 0.015471754794890083, + "loss": 3.5171, + "mean_token_accuracy": 0.35739442706108093, + "num_tokens": 3351305336.0, + "step": 6555 + }, + { + "epoch": 1.772850189291509, + "grad_norm": 2.625, + "learning_rate": 0.015470386602032647, + "loss": 3.2413, + "mean_token_accuracy": 0.3862127661705017, + "num_tokens": 3351829600.0, + "step": 6556 + }, + { + "epoch": 1.7731206057328286, + "grad_norm": 3.46875, + "learning_rate": 0.015469018272010606, + "loss": 3.1389, + "mean_token_accuracy": 0.3852102756500244, + "num_tokens": 3352352724.0, + "step": 6557 + }, + { + "epoch": 1.7733910221741482, + "grad_norm": 2.78125, + "learning_rate": 0.015467649804865941, + "loss": 3.1671, + "mean_token_accuracy": 0.387432336807251, + "num_tokens": 3352876989.0, + "step": 6558 + }, + { + "epoch": 1.7736614386154677, + "grad_norm": 3.1875, + "learning_rate": 0.01546628120064064, + "loss": 3.3266, + "mean_token_accuracy": 0.3976200222969055, + "num_tokens": 3353401109.0, + "step": 6559 + }, + { + "epoch": 1.7739318550567873, + "grad_norm": 2.5625, + "learning_rate": 0.015464912459376699, + "loss": 3.2699, + "mean_token_accuracy": 0.37984997034072876, + "num_tokens": 3353925307.0, + "step": 6560 + }, + { + "epoch": 1.774202271498107, + "grad_norm": 2.890625, + "learning_rate": 0.015463543581116112, + "loss": 3.2925, + "mean_token_accuracy": 0.39003777503967285, + "num_tokens": 3354449548.0, + "step": 6561 + }, + { + "epoch": 1.7744726879394266, + "grad_norm": 2.921875, + "learning_rate": 0.015462174565900884, + "loss": 3.3373, + "mean_token_accuracy": 0.4026726186275482, + "num_tokens": 3354908673.0, + "step": 6562 + }, + { + "epoch": 1.7747431043807462, + "grad_norm": 2.546875, + "learning_rate": 0.015460805413773015, + "loss": 3.2488, + "mean_token_accuracy": 0.39812082052230835, + "num_tokens": 3355432876.0, + "step": 6563 + }, + { + "epoch": 1.7750135208220659, + "grad_norm": 2.609375, + "learning_rate": 0.015459436124774517, + "loss": 3.1756, + "mean_token_accuracy": 0.3948288559913635, + "num_tokens": 3355956864.0, + "step": 6564 + }, + { + "epoch": 1.7752839372633855, + "grad_norm": 3.3125, + "learning_rate": 0.01545806669894741, + "loss": 3.1142, + "mean_token_accuracy": 0.40090620517730713, + "num_tokens": 3356419948.0, + "step": 6565 + }, + { + "epoch": 1.7755543537047052, + "grad_norm": 3.359375, + "learning_rate": 0.015456697136333701, + "loss": 2.9604, + "mean_token_accuracy": 0.4069729745388031, + "num_tokens": 3356944179.0, + "step": 6566 + }, + { + "epoch": 1.7758247701460248, + "grad_norm": 3.40625, + "learning_rate": 0.015455327436975419, + "loss": 3.1562, + "mean_token_accuracy": 0.3958964943885803, + "num_tokens": 3357468416.0, + "step": 6567 + }, + { + "epoch": 1.7760951865873444, + "grad_norm": 3.21875, + "learning_rate": 0.01545395760091459, + "loss": 3.18, + "mean_token_accuracy": 0.38583502173423767, + "num_tokens": 3357992596.0, + "step": 6568 + }, + { + "epoch": 1.776365603028664, + "grad_norm": 2.234375, + "learning_rate": 0.015452587628193242, + "loss": 3.3114, + "mean_token_accuracy": 0.3948849141597748, + "num_tokens": 3358516832.0, + "step": 6569 + }, + { + "epoch": 1.7766360194699837, + "grad_norm": 2.71875, + "learning_rate": 0.015451217518853413, + "loss": 3.1926, + "mean_token_accuracy": 0.36227720975875854, + "num_tokens": 3359041023.0, + "step": 6570 + }, + { + "epoch": 1.7769064359113034, + "grad_norm": 11.0, + "learning_rate": 0.015449847272937139, + "loss": 10.7144, + "mean_token_accuracy": 0.011366419494152069, + "num_tokens": 3359557878.0, + "step": 6571 + }, + { + "epoch": 1.777176852352623, + "grad_norm": 6.21875, + "learning_rate": 0.015448476890486465, + "loss": 3.6768, + "mean_token_accuracy": 0.3386228382587433, + "num_tokens": 3360082070.0, + "step": 6572 + }, + { + "epoch": 1.7774472687939427, + "grad_norm": 2.1875, + "learning_rate": 0.015447106371543439, + "loss": 3.4897, + "mean_token_accuracy": 0.37369364500045776, + "num_tokens": 3360571808.0, + "step": 6573 + }, + { + "epoch": 1.7777176852352623, + "grad_norm": 2.578125, + "learning_rate": 0.015445735716150108, + "loss": 3.1754, + "mean_token_accuracy": 0.3919944167137146, + "num_tokens": 3361095998.0, + "step": 6574 + }, + { + "epoch": 1.777988101676582, + "grad_norm": 2.859375, + "learning_rate": 0.015444364924348536, + "loss": 3.2055, + "mean_token_accuracy": 0.37174126505851746, + "num_tokens": 3361620230.0, + "step": 6575 + }, + { + "epoch": 1.7782585181179016, + "grad_norm": 3.234375, + "learning_rate": 0.015442993996180772, + "loss": 3.2974, + "mean_token_accuracy": 0.38317739963531494, + "num_tokens": 3362114642.0, + "step": 6576 + }, + { + "epoch": 1.7785289345592212, + "grad_norm": 3.796875, + "learning_rate": 0.01544162293168889, + "loss": 3.1927, + "mean_token_accuracy": 0.37643012404441833, + "num_tokens": 3362638830.0, + "step": 6577 + }, + { + "epoch": 1.7787993510005409, + "grad_norm": 3.5625, + "learning_rate": 0.015440251730914956, + "loss": 3.2604, + "mean_token_accuracy": 0.37737733125686646, + "num_tokens": 3363163073.0, + "step": 6578 + }, + { + "epoch": 1.7790697674418605, + "grad_norm": 2.75, + "learning_rate": 0.01543888039390104, + "loss": 2.9695, + "mean_token_accuracy": 0.41657358407974243, + "num_tokens": 3363651103.0, + "step": 6579 + }, + { + "epoch": 1.7793401838831802, + "grad_norm": 3.28125, + "learning_rate": 0.015437508920689218, + "loss": 3.1679, + "mean_token_accuracy": 0.38465380668640137, + "num_tokens": 3364174305.0, + "step": 6580 + }, + { + "epoch": 1.7796106003244998, + "grad_norm": 3.296875, + "learning_rate": 0.015436137311321572, + "loss": 3.4199, + "mean_token_accuracy": 0.381003737449646, + "num_tokens": 3364698555.0, + "step": 6581 + }, + { + "epoch": 1.7798810167658194, + "grad_norm": 3.890625, + "learning_rate": 0.01543476556584019, + "loss": 3.2961, + "mean_token_accuracy": 0.38523781299591064, + "num_tokens": 3365222826.0, + "step": 6582 + }, + { + "epoch": 1.780151433207139, + "grad_norm": 2.3125, + "learning_rate": 0.015433393684287157, + "loss": 3.135, + "mean_token_accuracy": 0.3858179450035095, + "num_tokens": 3365747106.0, + "step": 6583 + }, + { + "epoch": 1.7804218496484587, + "grad_norm": 2.71875, + "learning_rate": 0.015432021666704567, + "loss": 3.2092, + "mean_token_accuracy": 0.411030650138855, + "num_tokens": 3366212013.0, + "step": 6584 + }, + { + "epoch": 1.7806922660897784, + "grad_norm": 2.359375, + "learning_rate": 0.015430649513134522, + "loss": 3.3958, + "mean_token_accuracy": 0.3798343539237976, + "num_tokens": 3366736230.0, + "step": 6585 + }, + { + "epoch": 1.780962682531098, + "grad_norm": 3.46875, + "learning_rate": 0.015429277223619118, + "loss": 3.2677, + "mean_token_accuracy": 0.38147473335266113, + "num_tokens": 3367260492.0, + "step": 6586 + }, + { + "epoch": 1.7812330989724177, + "grad_norm": 3.015625, + "learning_rate": 0.01542790479820046, + "loss": 3.3795, + "mean_token_accuracy": 0.38769662380218506, + "num_tokens": 3367738666.0, + "step": 6587 + }, + { + "epoch": 1.7815035154137373, + "grad_norm": 3.09375, + "learning_rate": 0.015426532236920665, + "loss": 3.1672, + "mean_token_accuracy": 0.39704251289367676, + "num_tokens": 3368262911.0, + "step": 6588 + }, + { + "epoch": 1.781773931855057, + "grad_norm": 2.453125, + "learning_rate": 0.015425159539821844, + "loss": 3.1623, + "mean_token_accuracy": 0.3830220103263855, + "num_tokens": 3368787092.0, + "step": 6589 + }, + { + "epoch": 1.7820443482963764, + "grad_norm": 3.078125, + "learning_rate": 0.015423786706946113, + "loss": 3.2264, + "mean_token_accuracy": 0.3972163200378418, + "num_tokens": 3369247844.0, + "step": 6590 + }, + { + "epoch": 1.782314764737696, + "grad_norm": 5.84375, + "learning_rate": 0.015422413738335597, + "loss": 11.4063, + "mean_token_accuracy": 1.872474058473017e-05, + "num_tokens": 3369726127.0, + "step": 6591 + }, + { + "epoch": 1.7825851811790157, + "grad_norm": 8.375, + "learning_rate": 0.015421040634032424, + "loss": 3.5057, + "mean_token_accuracy": 0.41380593180656433, + "num_tokens": 3370247982.0, + "step": 6592 + }, + { + "epoch": 1.7828555976203353, + "grad_norm": 2.1875, + "learning_rate": 0.015419667394078717, + "loss": 3.4125, + "mean_token_accuracy": 0.38288772106170654, + "num_tokens": 3370698769.0, + "step": 6593 + }, + { + "epoch": 1.783126014061655, + "grad_norm": 2.59375, + "learning_rate": 0.015418294018516621, + "loss": 3.1636, + "mean_token_accuracy": 0.3915850520133972, + "num_tokens": 3371223000.0, + "step": 6594 + }, + { + "epoch": 1.7833964305029746, + "grad_norm": 3.078125, + "learning_rate": 0.015416920507388271, + "loss": 3.1303, + "mean_token_accuracy": 0.3894196152687073, + "num_tokens": 3371709487.0, + "step": 6595 + }, + { + "epoch": 1.7836668469442942, + "grad_norm": 2.421875, + "learning_rate": 0.015415546860735811, + "loss": 3.2201, + "mean_token_accuracy": 0.3867136836051941, + "num_tokens": 3372233673.0, + "step": 6596 + }, + { + "epoch": 1.7839372633856139, + "grad_norm": 3.265625, + "learning_rate": 0.015414173078601387, + "loss": 3.17, + "mean_token_accuracy": 0.4038252532482147, + "num_tokens": 3372706265.0, + "step": 6597 + }, + { + "epoch": 1.7842076798269335, + "grad_norm": 2.65625, + "learning_rate": 0.015412799161027153, + "loss": 3.0653, + "mean_token_accuracy": 0.3933189809322357, + "num_tokens": 3373230391.0, + "step": 6598 + }, + { + "epoch": 1.7844780962682532, + "grad_norm": 12.625, + "learning_rate": 0.015411425108055264, + "loss": 3.1239, + "mean_token_accuracy": 0.407324880361557, + "num_tokens": 3373754602.0, + "step": 6599 + }, + { + "epoch": 1.7847485127095726, + "grad_norm": 2.4375, + "learning_rate": 0.015410050919727879, + "loss": 3.3594, + "mean_token_accuracy": 0.3838813900947571, + "num_tokens": 3374278855.0, + "step": 6600 + }, + { + "epoch": 1.7850189291508922, + "grad_norm": 2.375, + "learning_rate": 0.015408676596087162, + "loss": 3.2522, + "mean_token_accuracy": 0.38486602902412415, + "num_tokens": 3374802936.0, + "step": 6601 + }, + { + "epoch": 1.7852893455922119, + "grad_norm": 3.328125, + "learning_rate": 0.01540730213717528, + "loss": 3.1242, + "mean_token_accuracy": 0.40091800689697266, + "num_tokens": 3375327164.0, + "step": 6602 + }, + { + "epoch": 1.7855597620335315, + "grad_norm": 3.25, + "learning_rate": 0.015405927543034409, + "loss": 3.2384, + "mean_token_accuracy": 0.3802527189254761, + "num_tokens": 3375818097.0, + "step": 6603 + }, + { + "epoch": 1.7858301784748511, + "grad_norm": 3.578125, + "learning_rate": 0.015404552813706727, + "loss": 3.3524, + "mean_token_accuracy": 0.3961185812950134, + "num_tokens": 3376263630.0, + "step": 6604 + }, + { + "epoch": 1.7861005949161708, + "grad_norm": 3.546875, + "learning_rate": 0.015403177949234409, + "loss": 3.5284, + "mean_token_accuracy": 0.35651445388793945, + "num_tokens": 3376787808.0, + "step": 6605 + }, + { + "epoch": 1.7863710113574904, + "grad_norm": 2.546875, + "learning_rate": 0.01540180294965964, + "loss": 3.0411, + "mean_token_accuracy": 0.3960535228252411, + "num_tokens": 3377311895.0, + "step": 6606 + }, + { + "epoch": 1.78664142779881, + "grad_norm": 2.546875, + "learning_rate": 0.015400427815024615, + "loss": 3.1756, + "mean_token_accuracy": 0.42615747451782227, + "num_tokens": 3377836169.0, + "step": 6607 + }, + { + "epoch": 1.7869118442401297, + "grad_norm": 3.03125, + "learning_rate": 0.015399052545371524, + "loss": 3.514, + "mean_token_accuracy": 0.363098680973053, + "num_tokens": 3378360409.0, + "step": 6608 + }, + { + "epoch": 1.7871822606814494, + "grad_norm": 2.984375, + "learning_rate": 0.015397677140742564, + "loss": 3.3394, + "mean_token_accuracy": 0.383781760931015, + "num_tokens": 3378884692.0, + "step": 6609 + }, + { + "epoch": 1.787452677122769, + "grad_norm": 3.046875, + "learning_rate": 0.015396301601179933, + "loss": 3.3854, + "mean_token_accuracy": 0.3658126890659332, + "num_tokens": 3379408967.0, + "step": 6610 + }, + { + "epoch": 1.7877230935640886, + "grad_norm": 6.09375, + "learning_rate": 0.015394925926725844, + "loss": 11.0472, + "mean_token_accuracy": 5.20466983289225e-06, + "num_tokens": 3379933216.0, + "step": 6611 + }, + { + "epoch": 1.7879935100054083, + "grad_norm": 7.34375, + "learning_rate": 0.015393550117422501, + "loss": 3.8291, + "mean_token_accuracy": 0.36956578493118286, + "num_tokens": 3380412728.0, + "step": 6612 + }, + { + "epoch": 1.788263926446728, + "grad_norm": 2.03125, + "learning_rate": 0.015392174173312121, + "loss": 3.5238, + "mean_token_accuracy": 0.34970220923423767, + "num_tokens": 3380936999.0, + "step": 6613 + }, + { + "epoch": 1.7885343428880476, + "grad_norm": 3.328125, + "learning_rate": 0.015390798094436922, + "loss": 3.2879, + "mean_token_accuracy": 0.37148672342300415, + "num_tokens": 3381425359.0, + "step": 6614 + }, + { + "epoch": 1.7888047593293672, + "grad_norm": 3.5, + "learning_rate": 0.015389421880839122, + "loss": 3.4125, + "mean_token_accuracy": 0.37407195568084717, + "num_tokens": 3381949637.0, + "step": 6615 + }, + { + "epoch": 1.7890751757706869, + "grad_norm": 2.921875, + "learning_rate": 0.015388045532560953, + "loss": 3.3817, + "mean_token_accuracy": 0.3904184103012085, + "num_tokens": 3382444938.0, + "step": 6616 + }, + { + "epoch": 1.7893455922120065, + "grad_norm": 3.140625, + "learning_rate": 0.015386669049644643, + "loss": 3.1607, + "mean_token_accuracy": 0.3818311393260956, + "num_tokens": 3382969122.0, + "step": 6617 + }, + { + "epoch": 1.7896160086533262, + "grad_norm": 3.40625, + "learning_rate": 0.015385292432132424, + "loss": 3.2878, + "mean_token_accuracy": 0.35855430364608765, + "num_tokens": 3383493042.0, + "step": 6618 + }, + { + "epoch": 1.7898864250946458, + "grad_norm": 3.0, + "learning_rate": 0.015383915680066536, + "loss": 3.4428, + "mean_token_accuracy": 0.38719648122787476, + "num_tokens": 3383996057.0, + "step": 6619 + }, + { + "epoch": 1.7901568415359654, + "grad_norm": 2.640625, + "learning_rate": 0.015382538793489225, + "loss": 3.2967, + "mean_token_accuracy": 0.40157589316368103, + "num_tokens": 3384465992.0, + "step": 6620 + }, + { + "epoch": 1.790427257977285, + "grad_norm": 2.640625, + "learning_rate": 0.015381161772442736, + "loss": 3.347, + "mean_token_accuracy": 0.39237695932388306, + "num_tokens": 3384892050.0, + "step": 6621 + }, + { + "epoch": 1.7906976744186047, + "grad_norm": 3.5625, + "learning_rate": 0.015379784616969318, + "loss": 3.2115, + "mean_token_accuracy": 0.4020671844482422, + "num_tokens": 3385366330.0, + "step": 6622 + }, + { + "epoch": 1.7909680908599244, + "grad_norm": 2.578125, + "learning_rate": 0.015378407327111227, + "loss": 3.1892, + "mean_token_accuracy": 0.40117400884628296, + "num_tokens": 3385869002.0, + "step": 6623 + }, + { + "epoch": 1.791238507301244, + "grad_norm": 3.40625, + "learning_rate": 0.015377029902910726, + "loss": 3.1002, + "mean_token_accuracy": 0.40722212195396423, + "num_tokens": 3386393284.0, + "step": 6624 + }, + { + "epoch": 1.7915089237425637, + "grad_norm": 2.78125, + "learning_rate": 0.015375652344410073, + "loss": 3.2462, + "mean_token_accuracy": 0.39266711473464966, + "num_tokens": 3386917531.0, + "step": 6625 + }, + { + "epoch": 1.7917793401838833, + "grad_norm": 3.328125, + "learning_rate": 0.01537427465165154, + "loss": 3.1942, + "mean_token_accuracy": 0.3821376860141754, + "num_tokens": 3387441547.0, + "step": 6626 + }, + { + "epoch": 1.792049756625203, + "grad_norm": 2.28125, + "learning_rate": 0.015372896824677396, + "loss": 3.2167, + "mean_token_accuracy": 0.3886250853538513, + "num_tokens": 3387965788.0, + "step": 6627 + }, + { + "epoch": 1.7923201730665226, + "grad_norm": 3.421875, + "learning_rate": 0.015371518863529918, + "loss": 3.4421, + "mean_token_accuracy": 0.3667941391468048, + "num_tokens": 3388456326.0, + "step": 6628 + }, + { + "epoch": 1.7925905895078422, + "grad_norm": 3.171875, + "learning_rate": 0.015370140768251384, + "loss": 3.33, + "mean_token_accuracy": 0.3893658220767975, + "num_tokens": 3388973382.0, + "step": 6629 + }, + { + "epoch": 1.7928610059491619, + "grad_norm": 3.1875, + "learning_rate": 0.01536876253888408, + "loss": 3.1995, + "mean_token_accuracy": 0.39263877272605896, + "num_tokens": 3389497539.0, + "step": 6630 + }, + { + "epoch": 1.7931314223904813, + "grad_norm": 31.0, + "learning_rate": 0.015367384175470294, + "loss": 11.4702, + "mean_token_accuracy": 0.009040683507919312, + "num_tokens": 3390021799.0, + "step": 6631 + }, + { + "epoch": 1.793401838831801, + "grad_norm": 7.5, + "learning_rate": 0.015366005678052318, + "loss": 3.672, + "mean_token_accuracy": 0.327008992433548, + "num_tokens": 3390545969.0, + "step": 6632 + }, + { + "epoch": 1.7936722552731206, + "grad_norm": 1.734375, + "learning_rate": 0.015364627046672448, + "loss": 3.3877, + "mean_token_accuracy": 0.39794740080833435, + "num_tokens": 3391008394.0, + "step": 6633 + }, + { + "epoch": 1.7939426717144402, + "grad_norm": 2.234375, + "learning_rate": 0.015363248281372989, + "loss": 3.3268, + "mean_token_accuracy": 0.3679463863372803, + "num_tokens": 3391532677.0, + "step": 6634 + }, + { + "epoch": 1.7942130881557599, + "grad_norm": 2.96875, + "learning_rate": 0.015361869382196235, + "loss": 3.3066, + "mean_token_accuracy": 0.37443798780441284, + "num_tokens": 3392056802.0, + "step": 6635 + }, + { + "epoch": 1.7944835045970795, + "grad_norm": 2.921875, + "learning_rate": 0.015360490349184502, + "loss": 3.352, + "mean_token_accuracy": 0.3695070445537567, + "num_tokens": 3392581056.0, + "step": 6636 + }, + { + "epoch": 1.7947539210383991, + "grad_norm": 3.4375, + "learning_rate": 0.015359111182380104, + "loss": 3.4484, + "mean_token_accuracy": 0.41925719380378723, + "num_tokens": 3393040717.0, + "step": 6637 + }, + { + "epoch": 1.7950243374797188, + "grad_norm": 3.84375, + "learning_rate": 0.015357731881825352, + "loss": 3.2155, + "mean_token_accuracy": 0.36924704909324646, + "num_tokens": 3393527511.0, + "step": 6638 + }, + { + "epoch": 1.7952947539210384, + "grad_norm": 2.734375, + "learning_rate": 0.015356352447562574, + "loss": 3.2798, + "mean_token_accuracy": 0.38332611322402954, + "num_tokens": 3394051623.0, + "step": 6639 + }, + { + "epoch": 1.795565170362358, + "grad_norm": 2.71875, + "learning_rate": 0.015354972879634091, + "loss": 3.266, + "mean_token_accuracy": 0.3819379210472107, + "num_tokens": 3394575776.0, + "step": 6640 + }, + { + "epoch": 1.7958355868036775, + "grad_norm": 2.546875, + "learning_rate": 0.015353593178082232, + "loss": 3.3074, + "mean_token_accuracy": 0.3933058977127075, + "num_tokens": 3395099959.0, + "step": 6641 + }, + { + "epoch": 1.7961060032449971, + "grad_norm": 3.859375, + "learning_rate": 0.015352213342949332, + "loss": 3.02, + "mean_token_accuracy": 0.44003021717071533, + "num_tokens": 3395624219.0, + "step": 6642 + }, + { + "epoch": 1.7963764196863168, + "grad_norm": 1.734375, + "learning_rate": 0.015350833374277728, + "loss": 3.1477, + "mean_token_accuracy": 0.4000946283340454, + "num_tokens": 3396120686.0, + "step": 6643 + }, + { + "epoch": 1.7966468361276364, + "grad_norm": 3.328125, + "learning_rate": 0.015349453272109759, + "loss": 3.3478, + "mean_token_accuracy": 0.3759983777999878, + "num_tokens": 3396644877.0, + "step": 6644 + }, + { + "epoch": 1.796917252568956, + "grad_norm": 2.828125, + "learning_rate": 0.015348073036487773, + "loss": 3.3157, + "mean_token_accuracy": 0.3821478486061096, + "num_tokens": 3397169155.0, + "step": 6645 + }, + { + "epoch": 1.7971876690102757, + "grad_norm": 3.546875, + "learning_rate": 0.01534669266745412, + "loss": 3.073, + "mean_token_accuracy": 0.4003888964653015, + "num_tokens": 3397693257.0, + "step": 6646 + }, + { + "epoch": 1.7974580854515954, + "grad_norm": 3.484375, + "learning_rate": 0.015345312165051157, + "loss": 3.124, + "mean_token_accuracy": 0.3915782570838928, + "num_tokens": 3398211930.0, + "step": 6647 + }, + { + "epoch": 1.797728501892915, + "grad_norm": 2.640625, + "learning_rate": 0.015343931529321236, + "loss": 3.0795, + "mean_token_accuracy": 0.38880646228790283, + "num_tokens": 3398735995.0, + "step": 6648 + }, + { + "epoch": 1.7979989183342346, + "grad_norm": 3.125, + "learning_rate": 0.015342550760306717, + "loss": 3.3426, + "mean_token_accuracy": 0.372191846370697, + "num_tokens": 3399260244.0, + "step": 6649 + }, + { + "epoch": 1.7982693347755543, + "grad_norm": 2.59375, + "learning_rate": 0.015341169858049975, + "loss": 2.9929, + "mean_token_accuracy": 0.3979544937610626, + "num_tokens": 3399784528.0, + "step": 6650 + }, + { + "epoch": 1.798539751216874, + "grad_norm": 3.078125, + "learning_rate": 0.015339788822593375, + "loss": 10.8913, + "mean_token_accuracy": 1.4558616385329515e-05, + "num_tokens": 3400308715.0, + "step": 6651 + }, + { + "epoch": 1.7988101676581936, + "grad_norm": 8.75, + "learning_rate": 0.015338407653979286, + "loss": 3.7602, + "mean_token_accuracy": 0.3263242542743683, + "num_tokens": 3400832934.0, + "step": 6652 + }, + { + "epoch": 1.7990805840995132, + "grad_norm": 2.609375, + "learning_rate": 0.015337026352250094, + "loss": 3.3935, + "mean_token_accuracy": 0.363855242729187, + "num_tokens": 3401357206.0, + "step": 6653 + }, + { + "epoch": 1.7993510005408329, + "grad_norm": 3.359375, + "learning_rate": 0.015335644917448184, + "loss": 3.2261, + "mean_token_accuracy": 0.3784700632095337, + "num_tokens": 3401856628.0, + "step": 6654 + }, + { + "epoch": 1.7996214169821525, + "grad_norm": 3.390625, + "learning_rate": 0.01533426334961593, + "loss": 3.3086, + "mean_token_accuracy": 0.36730003356933594, + "num_tokens": 3402380904.0, + "step": 6655 + }, + { + "epoch": 1.7998918334234721, + "grad_norm": 2.609375, + "learning_rate": 0.015332881648795735, + "loss": 3.2559, + "mean_token_accuracy": 0.40219518542289734, + "num_tokens": 3402841121.0, + "step": 6656 + }, + { + "epoch": 1.8001622498647918, + "grad_norm": 2.96875, + "learning_rate": 0.01533149981502999, + "loss": 3.3811, + "mean_token_accuracy": 0.3810123801231384, + "num_tokens": 3403365210.0, + "step": 6657 + }, + { + "epoch": 1.8004326663061114, + "grad_norm": 2.328125, + "learning_rate": 0.015330117848361092, + "loss": 3.186, + "mean_token_accuracy": 0.4099755585193634, + "num_tokens": 3403828069.0, + "step": 6658 + }, + { + "epoch": 1.800703082747431, + "grad_norm": 2.78125, + "learning_rate": 0.015328735748831441, + "loss": 3.1756, + "mean_token_accuracy": 0.3921881914138794, + "num_tokens": 3404339714.0, + "step": 6659 + }, + { + "epoch": 1.8009734991887507, + "grad_norm": 4.09375, + "learning_rate": 0.015327353516483448, + "loss": 3.4303, + "mean_token_accuracy": 0.40409746766090393, + "num_tokens": 3404856274.0, + "step": 6660 + }, + { + "epoch": 1.8012439156300704, + "grad_norm": 3.59375, + "learning_rate": 0.015325971151359528, + "loss": 3.4154, + "mean_token_accuracy": 0.3654181659221649, + "num_tokens": 3405380561.0, + "step": 6661 + }, + { + "epoch": 1.80151433207139, + "grad_norm": 3.109375, + "learning_rate": 0.015324588653502086, + "loss": 3.1149, + "mean_token_accuracy": 0.3844374120235443, + "num_tokens": 3405904805.0, + "step": 6662 + }, + { + "epoch": 1.8017847485127096, + "grad_norm": 3.171875, + "learning_rate": 0.01532320602295355, + "loss": 3.0605, + "mean_token_accuracy": 0.4189296066761017, + "num_tokens": 3406429074.0, + "step": 6663 + }, + { + "epoch": 1.8020551649540293, + "grad_norm": 2.6875, + "learning_rate": 0.015321823259756338, + "loss": 3.2248, + "mean_token_accuracy": 0.3703223168849945, + "num_tokens": 3406953337.0, + "step": 6664 + }, + { + "epoch": 1.802325581395349, + "grad_norm": 3.328125, + "learning_rate": 0.015320440363952876, + "loss": 3.3382, + "mean_token_accuracy": 0.3673907518386841, + "num_tokens": 3407477590.0, + "step": 6665 + }, + { + "epoch": 1.8025959978366686, + "grad_norm": 2.515625, + "learning_rate": 0.0153190573355856, + "loss": 3.104, + "mean_token_accuracy": 0.39984944462776184, + "num_tokens": 3408001773.0, + "step": 6666 + }, + { + "epoch": 1.8028664142779882, + "grad_norm": 2.984375, + "learning_rate": 0.015317674174696943, + "loss": 3.3416, + "mean_token_accuracy": 0.36682820320129395, + "num_tokens": 3408526043.0, + "step": 6667 + }, + { + "epoch": 1.8031368307193079, + "grad_norm": 2.796875, + "learning_rate": 0.015316290881329344, + "loss": 3.123, + "mean_token_accuracy": 0.38814932107925415, + "num_tokens": 3409050260.0, + "step": 6668 + }, + { + "epoch": 1.8034072471606275, + "grad_norm": 2.640625, + "learning_rate": 0.01531490745552525, + "loss": 3.1943, + "mean_token_accuracy": 0.37963777780532837, + "num_tokens": 3409574521.0, + "step": 6669 + }, + { + "epoch": 1.8036776636019471, + "grad_norm": 2.375, + "learning_rate": 0.015313523897327106, + "loss": 3.1538, + "mean_token_accuracy": 0.38470661640167236, + "num_tokens": 3410098619.0, + "step": 6670 + }, + { + "epoch": 1.8039480800432668, + "grad_norm": 2.90625, + "learning_rate": 0.015312140206777363, + "loss": 9.6125, + "mean_token_accuracy": 0.007827989757061005, + "num_tokens": 3410622874.0, + "step": 6671 + }, + { + "epoch": 1.8042184964845862, + "grad_norm": 7.8125, + "learning_rate": 0.015310756383918479, + "loss": 3.8273, + "mean_token_accuracy": 0.30970993638038635, + "num_tokens": 3411147032.0, + "step": 6672 + }, + { + "epoch": 1.8044889129259059, + "grad_norm": 2.90625, + "learning_rate": 0.015309372428792907, + "loss": 3.2352, + "mean_token_accuracy": 0.37820637226104736, + "num_tokens": 3411671189.0, + "step": 6673 + }, + { + "epoch": 1.8047593293672255, + "grad_norm": 2.65625, + "learning_rate": 0.015307988341443118, + "loss": 3.2756, + "mean_token_accuracy": 0.3892570734024048, + "num_tokens": 3412195302.0, + "step": 6674 + }, + { + "epoch": 1.8050297458085451, + "grad_norm": 3.46875, + "learning_rate": 0.015306604121911578, + "loss": 3.3745, + "mean_token_accuracy": 0.3664248585700989, + "num_tokens": 3412719583.0, + "step": 6675 + }, + { + "epoch": 1.8053001622498648, + "grad_norm": 2.671875, + "learning_rate": 0.015305219770240758, + "loss": 3.2184, + "mean_token_accuracy": 0.3764283061027527, + "num_tokens": 3413182067.0, + "step": 6676 + }, + { + "epoch": 1.8055705786911844, + "grad_norm": 2.921875, + "learning_rate": 0.01530383528647314, + "loss": 3.465, + "mean_token_accuracy": 0.34850746393203735, + "num_tokens": 3413703599.0, + "step": 6677 + }, + { + "epoch": 1.805840995132504, + "grad_norm": 2.59375, + "learning_rate": 0.015302450670651191, + "loss": 3.2575, + "mean_token_accuracy": 0.3889754116535187, + "num_tokens": 3414227609.0, + "step": 6678 + }, + { + "epoch": 1.8061114115738237, + "grad_norm": 2.6875, + "learning_rate": 0.015301065922817408, + "loss": 3.2137, + "mean_token_accuracy": 0.3699812889099121, + "num_tokens": 3414751848.0, + "step": 6679 + }, + { + "epoch": 1.8063818280151434, + "grad_norm": 2.5625, + "learning_rate": 0.01529968104301427, + "loss": 3.3038, + "mean_token_accuracy": 0.4007400870323181, + "num_tokens": 3415275972.0, + "step": 6680 + }, + { + "epoch": 1.806652244456463, + "grad_norm": 3.296875, + "learning_rate": 0.015298296031284275, + "loss": 3.1889, + "mean_token_accuracy": 0.387056827545166, + "num_tokens": 3415766018.0, + "step": 6681 + }, + { + "epoch": 1.8069226608977826, + "grad_norm": 2.640625, + "learning_rate": 0.015296910887669919, + "loss": 3.1503, + "mean_token_accuracy": 0.39369815587997437, + "num_tokens": 3416290215.0, + "step": 6682 + }, + { + "epoch": 1.807193077339102, + "grad_norm": 3.078125, + "learning_rate": 0.015295525612213699, + "loss": 3.078, + "mean_token_accuracy": 0.3982314169406891, + "num_tokens": 3416771053.0, + "step": 6683 + }, + { + "epoch": 1.8074634937804217, + "grad_norm": 3.59375, + "learning_rate": 0.01529414020495812, + "loss": 3.1891, + "mean_token_accuracy": 0.38320159912109375, + "num_tokens": 3417295229.0, + "step": 6684 + }, + { + "epoch": 1.8077339102217413, + "grad_norm": 3.09375, + "learning_rate": 0.015292754665945691, + "loss": 3.0967, + "mean_token_accuracy": 0.3924142122268677, + "num_tokens": 3417819466.0, + "step": 6685 + }, + { + "epoch": 1.808004326663061, + "grad_norm": 3.140625, + "learning_rate": 0.015291368995218928, + "loss": 3.3366, + "mean_token_accuracy": 0.38314545154571533, + "num_tokens": 3418343610.0, + "step": 6686 + }, + { + "epoch": 1.8082747431043806, + "grad_norm": 4.46875, + "learning_rate": 0.01528998319282034, + "loss": 3.4813, + "mean_token_accuracy": 0.3815053105354309, + "num_tokens": 3418822658.0, + "step": 6687 + }, + { + "epoch": 1.8085451595457003, + "grad_norm": 2.890625, + "learning_rate": 0.015288597258792454, + "loss": 3.4313, + "mean_token_accuracy": 0.37459856271743774, + "num_tokens": 3419346867.0, + "step": 6688 + }, + { + "epoch": 1.80881557598702, + "grad_norm": 3.046875, + "learning_rate": 0.01528721119317779, + "loss": 3.1188, + "mean_token_accuracy": 0.37261080741882324, + "num_tokens": 3419871138.0, + "step": 6689 + }, + { + "epoch": 1.8090859924283396, + "grad_norm": 2.578125, + "learning_rate": 0.015285824996018882, + "loss": 3.3068, + "mean_token_accuracy": 0.3927713930606842, + "num_tokens": 3420395406.0, + "step": 6690 + }, + { + "epoch": 1.8093564088696592, + "grad_norm": 308.0, + "learning_rate": 0.015284438667358258, + "loss": 17.9574, + "mean_token_accuracy": 0.0021846387535333633, + "num_tokens": 3420919680.0, + "step": 6691 + }, + { + "epoch": 1.8096268253109788, + "grad_norm": 7.8125, + "learning_rate": 0.015283052207238456, + "loss": 3.8447, + "mean_token_accuracy": 0.3266668915748596, + "num_tokens": 3421408456.0, + "step": 6692 + }, + { + "epoch": 1.8098972417522985, + "grad_norm": 2.421875, + "learning_rate": 0.015281665615702017, + "loss": 3.3662, + "mean_token_accuracy": 0.3730682134628296, + "num_tokens": 3421932684.0, + "step": 6693 + }, + { + "epoch": 1.8101676581936181, + "grad_norm": 3.15625, + "learning_rate": 0.015280278892791484, + "loss": 3.3807, + "mean_token_accuracy": 0.379454642534256, + "num_tokens": 3422456839.0, + "step": 6694 + }, + { + "epoch": 1.8104380746349378, + "grad_norm": 3.125, + "learning_rate": 0.015278892038549406, + "loss": 3.2826, + "mean_token_accuracy": 0.3835354447364807, + "num_tokens": 3422981047.0, + "step": 6695 + }, + { + "epoch": 1.8107084910762574, + "grad_norm": 2.4375, + "learning_rate": 0.015277505053018342, + "loss": 3.4111, + "mean_token_accuracy": 0.3756219744682312, + "num_tokens": 3423505327.0, + "step": 6696 + }, + { + "epoch": 1.810978907517577, + "grad_norm": 3.046875, + "learning_rate": 0.015276117936240839, + "loss": 3.0763, + "mean_token_accuracy": 0.3891534209251404, + "num_tokens": 3424029268.0, + "step": 6697 + }, + { + "epoch": 1.8112493239588967, + "grad_norm": 2.25, + "learning_rate": 0.015274730688259462, + "loss": 3.2169, + "mean_token_accuracy": 0.37611448764801025, + "num_tokens": 3424553444.0, + "step": 6698 + }, + { + "epoch": 1.8115197404002163, + "grad_norm": 2.65625, + "learning_rate": 0.015273343309116779, + "loss": 3.1888, + "mean_token_accuracy": 0.3861587941646576, + "num_tokens": 3425077633.0, + "step": 6699 + }, + { + "epoch": 1.811790156841536, + "grad_norm": 3.203125, + "learning_rate": 0.015271955798855357, + "loss": 3.1033, + "mean_token_accuracy": 0.41512274742126465, + "num_tokens": 3425544375.0, + "step": 6700 + }, + { + "epoch": 1.8120605732828556, + "grad_norm": 3.140625, + "learning_rate": 0.015270568157517766, + "loss": 2.9999, + "mean_token_accuracy": 0.38473230600357056, + "num_tokens": 3426045076.0, + "step": 6701 + }, + { + "epoch": 1.8123309897241753, + "grad_norm": 2.625, + "learning_rate": 0.015269180385146582, + "loss": 3.2103, + "mean_token_accuracy": 0.3881397843360901, + "num_tokens": 3426569349.0, + "step": 6702 + }, + { + "epoch": 1.812601406165495, + "grad_norm": 2.765625, + "learning_rate": 0.015267792481784392, + "loss": 3.1922, + "mean_token_accuracy": 0.381798654794693, + "num_tokens": 3427093624.0, + "step": 6703 + }, + { + "epoch": 1.8128718226068146, + "grad_norm": 3.09375, + "learning_rate": 0.015266404447473781, + "loss": 3.2052, + "mean_token_accuracy": 0.39799177646636963, + "num_tokens": 3427534385.0, + "step": 6704 + }, + { + "epoch": 1.8131422390481342, + "grad_norm": 2.71875, + "learning_rate": 0.01526501628225733, + "loss": 3.1008, + "mean_token_accuracy": 0.3682291805744171, + "num_tokens": 3428058552.0, + "step": 6705 + }, + { + "epoch": 1.8134126554894539, + "grad_norm": 3.390625, + "learning_rate": 0.015263627986177641, + "loss": 3.1389, + "mean_token_accuracy": 0.36735713481903076, + "num_tokens": 3428582705.0, + "step": 6706 + }, + { + "epoch": 1.8136830719307735, + "grad_norm": 2.15625, + "learning_rate": 0.015262239559277304, + "loss": 3.0731, + "mean_token_accuracy": 0.3937913775444031, + "num_tokens": 3429099192.0, + "step": 6707 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 3.421875, + "learning_rate": 0.015260851001598923, + "loss": 3.1297, + "mean_token_accuracy": 0.39698556065559387, + "num_tokens": 3429623445.0, + "step": 6708 + }, + { + "epoch": 1.8142239048134128, + "grad_norm": 3.015625, + "learning_rate": 0.015259462313185104, + "loss": 3.0503, + "mean_token_accuracy": 0.35703760385513306, + "num_tokens": 3430147551.0, + "step": 6709 + }, + { + "epoch": 1.8144943212547324, + "grad_norm": 3.515625, + "learning_rate": 0.015258073494078456, + "loss": 3.0591, + "mean_token_accuracy": 0.4725882411003113, + "num_tokens": 3430608039.0, + "step": 6710 + }, + { + "epoch": 1.814764737696052, + "grad_norm": 39.5, + "learning_rate": 0.015256684544321589, + "loss": 17.8776, + "mean_token_accuracy": 2.830423682098626e-06, + "num_tokens": 3431070497.0, + "step": 6711 + }, + { + "epoch": 1.8150351541373717, + "grad_norm": 5.8125, + "learning_rate": 0.015255295463957121, + "loss": 3.5637, + "mean_token_accuracy": 0.355208158493042, + "num_tokens": 3431594591.0, + "step": 6712 + }, + { + "epoch": 1.8153055705786911, + "grad_norm": 3.15625, + "learning_rate": 0.015253906253027678, + "loss": 3.3307, + "mean_token_accuracy": 0.36714038252830505, + "num_tokens": 3432102548.0, + "step": 6713 + }, + { + "epoch": 1.8155759870200108, + "grad_norm": 3.234375, + "learning_rate": 0.015252516911575878, + "loss": 3.3413, + "mean_token_accuracy": 0.3899917006492615, + "num_tokens": 3432573386.0, + "step": 6714 + }, + { + "epoch": 1.8158464034613304, + "grad_norm": 2.265625, + "learning_rate": 0.015251127439644354, + "loss": 3.2295, + "mean_token_accuracy": 0.3911563754081726, + "num_tokens": 3433097646.0, + "step": 6715 + }, + { + "epoch": 1.81611681990265, + "grad_norm": 2.359375, + "learning_rate": 0.01524973783727574, + "loss": 3.35, + "mean_token_accuracy": 0.3821913003921509, + "num_tokens": 3433621848.0, + "step": 6716 + }, + { + "epoch": 1.8163872363439697, + "grad_norm": 2.609375, + "learning_rate": 0.015248348104512668, + "loss": 3.1684, + "mean_token_accuracy": 0.41384488344192505, + "num_tokens": 3434140369.0, + "step": 6717 + }, + { + "epoch": 1.8166576527852893, + "grad_norm": 2.59375, + "learning_rate": 0.015246958241397785, + "loss": 3.3369, + "mean_token_accuracy": 0.39011454582214355, + "num_tokens": 3434641158.0, + "step": 6718 + }, + { + "epoch": 1.816928069226609, + "grad_norm": 3.203125, + "learning_rate": 0.015245568247973736, + "loss": 3.205, + "mean_token_accuracy": 0.3948677182197571, + "num_tokens": 3435165239.0, + "step": 6719 + }, + { + "epoch": 1.8171984856679286, + "grad_norm": 2.546875, + "learning_rate": 0.015244178124283166, + "loss": 3.0822, + "mean_token_accuracy": 0.4040226638317108, + "num_tokens": 3435642717.0, + "step": 6720 + }, + { + "epoch": 1.8174689021092483, + "grad_norm": 2.5625, + "learning_rate": 0.015242787870368726, + "loss": 3.0093, + "mean_token_accuracy": 0.4184580445289612, + "num_tokens": 3436143448.0, + "step": 6721 + }, + { + "epoch": 1.817739318550568, + "grad_norm": 3.328125, + "learning_rate": 0.015241397486273083, + "loss": 3.4169, + "mean_token_accuracy": 0.36154478788375854, + "num_tokens": 3436667672.0, + "step": 6722 + }, + { + "epoch": 1.8180097349918876, + "grad_norm": 2.953125, + "learning_rate": 0.015240006972038887, + "loss": 3.1281, + "mean_token_accuracy": 0.4003646969795227, + "num_tokens": 3437191870.0, + "step": 6723 + }, + { + "epoch": 1.818280151433207, + "grad_norm": 3.09375, + "learning_rate": 0.01523861632770881, + "loss": 3.3454, + "mean_token_accuracy": 0.40376758575439453, + "num_tokens": 3437655815.0, + "step": 6724 + }, + { + "epoch": 1.8185505678745266, + "grad_norm": 2.71875, + "learning_rate": 0.015237225553325517, + "loss": 3.2983, + "mean_token_accuracy": 0.3575161099433899, + "num_tokens": 3438179883.0, + "step": 6725 + }, + { + "epoch": 1.8188209843158463, + "grad_norm": 2.421875, + "learning_rate": 0.015235834648931688, + "loss": 3.2046, + "mean_token_accuracy": 0.38924312591552734, + "num_tokens": 3438704063.0, + "step": 6726 + }, + { + "epoch": 1.819091400757166, + "grad_norm": 3.109375, + "learning_rate": 0.015234443614569992, + "loss": 3.2553, + "mean_token_accuracy": 0.3887816071510315, + "num_tokens": 3439228260.0, + "step": 6727 + }, + { + "epoch": 1.8193618171984856, + "grad_norm": 2.5625, + "learning_rate": 0.015233052450283113, + "loss": 3.1481, + "mean_token_accuracy": 0.41624775528907776, + "num_tokens": 3439752428.0, + "step": 6728 + }, + { + "epoch": 1.8196322336398052, + "grad_norm": 3.09375, + "learning_rate": 0.015231661156113739, + "loss": 3.2733, + "mean_token_accuracy": 0.355086088180542, + "num_tokens": 3440276559.0, + "step": 6729 + }, + { + "epoch": 1.8199026500811248, + "grad_norm": 2.8125, + "learning_rate": 0.015230269732104552, + "loss": 3.2421, + "mean_token_accuracy": 0.39330169558525085, + "num_tokens": 3440800759.0, + "step": 6730 + }, + { + "epoch": 1.8201730665224445, + "grad_norm": 15.625, + "learning_rate": 0.01522887817829825, + "loss": 10.2098, + "mean_token_accuracy": 0.008806261233985424, + "num_tokens": 3441301227.0, + "step": 6731 + }, + { + "epoch": 1.8204434829637641, + "grad_norm": 7.1875, + "learning_rate": 0.015227486494737532, + "loss": 3.8196, + "mean_token_accuracy": 0.3033154606819153, + "num_tokens": 3441825214.0, + "step": 6732 + }, + { + "epoch": 1.8207138994050838, + "grad_norm": 2.3125, + "learning_rate": 0.015226094681465097, + "loss": 3.5121, + "mean_token_accuracy": 0.35592570900917053, + "num_tokens": 3442349416.0, + "step": 6733 + }, + { + "epoch": 1.8209843158464034, + "grad_norm": 2.09375, + "learning_rate": 0.015224702738523646, + "loss": 3.3205, + "mean_token_accuracy": 0.37258806824684143, + "num_tokens": 3442873601.0, + "step": 6734 + }, + { + "epoch": 1.821254732287723, + "grad_norm": 2.75, + "learning_rate": 0.01522331066595589, + "loss": 3.2288, + "mean_token_accuracy": 0.3977254033088684, + "num_tokens": 3443343952.0, + "step": 6735 + }, + { + "epoch": 1.8215251487290427, + "grad_norm": 2.671875, + "learning_rate": 0.015221918463804548, + "loss": 3.2453, + "mean_token_accuracy": 0.4007396399974823, + "num_tokens": 3443868227.0, + "step": 6736 + }, + { + "epoch": 1.8217955651703623, + "grad_norm": 2.875, + "learning_rate": 0.015220526132112324, + "loss": 3.1555, + "mean_token_accuracy": 0.40274927020072937, + "num_tokens": 3444339880.0, + "step": 6737 + }, + { + "epoch": 1.822065981611682, + "grad_norm": 2.90625, + "learning_rate": 0.015219133670921953, + "loss": 3.12, + "mean_token_accuracy": 0.39464038610458374, + "num_tokens": 3444824046.0, + "step": 6738 + }, + { + "epoch": 1.8223363980530016, + "grad_norm": 2.796875, + "learning_rate": 0.01521774108027615, + "loss": 2.9393, + "mean_token_accuracy": 0.4373983144760132, + "num_tokens": 3445348223.0, + "step": 6739 + }, + { + "epoch": 1.8226068144943213, + "grad_norm": 2.265625, + "learning_rate": 0.015216348360217647, + "loss": 3.2287, + "mean_token_accuracy": 0.4051615595817566, + "num_tokens": 3445835974.0, + "step": 6740 + }, + { + "epoch": 1.822877230935641, + "grad_norm": 3.375, + "learning_rate": 0.015214955510789179, + "loss": 3.1991, + "mean_token_accuracy": 0.38289397954940796, + "num_tokens": 3446354008.0, + "step": 6741 + }, + { + "epoch": 1.8231476473769606, + "grad_norm": 2.265625, + "learning_rate": 0.01521356253203348, + "loss": 3.1841, + "mean_token_accuracy": 0.4104973077774048, + "num_tokens": 3446832771.0, + "step": 6742 + }, + { + "epoch": 1.8234180638182802, + "grad_norm": 3.078125, + "learning_rate": 0.015212169423993291, + "loss": 3.2017, + "mean_token_accuracy": 0.3705138564109802, + "num_tokens": 3447314699.0, + "step": 6743 + }, + { + "epoch": 1.8236884802595998, + "grad_norm": 2.765625, + "learning_rate": 0.015210776186711355, + "loss": 3.1009, + "mean_token_accuracy": 0.3892737627029419, + "num_tokens": 3447838744.0, + "step": 6744 + }, + { + "epoch": 1.8239588967009195, + "grad_norm": 2.96875, + "learning_rate": 0.015209382820230424, + "loss": 3.3204, + "mean_token_accuracy": 0.3823298215866089, + "num_tokens": 3448363017.0, + "step": 6745 + }, + { + "epoch": 1.8242293131422391, + "grad_norm": 3.703125, + "learning_rate": 0.01520798932459325, + "loss": 3.1694, + "mean_token_accuracy": 0.400285005569458, + "num_tokens": 3448887294.0, + "step": 6746 + }, + { + "epoch": 1.8244997295835588, + "grad_norm": 2.734375, + "learning_rate": 0.015206595699842586, + "loss": 3.1899, + "mean_token_accuracy": 0.3918757140636444, + "num_tokens": 3449411572.0, + "step": 6747 + }, + { + "epoch": 1.8247701460248784, + "grad_norm": 3.484375, + "learning_rate": 0.015205201946021197, + "loss": 3.2084, + "mean_token_accuracy": 0.4020794630050659, + "num_tokens": 3449935727.0, + "step": 6748 + }, + { + "epoch": 1.825040562466198, + "grad_norm": 3.34375, + "learning_rate": 0.015203808063171849, + "loss": 2.8437, + "mean_token_accuracy": 0.4073307514190674, + "num_tokens": 3450459992.0, + "step": 6749 + }, + { + "epoch": 1.8253109789075177, + "grad_norm": 3.984375, + "learning_rate": 0.015202414051337303, + "loss": 3.2607, + "mean_token_accuracy": 0.3975602984428406, + "num_tokens": 3450937126.0, + "step": 6750 + }, + { + "epoch": 1.8255813953488373, + "grad_norm": 114.0, + "learning_rate": 0.015201019910560333, + "loss": 14.1169, + "mean_token_accuracy": 0.035760290920734406, + "num_tokens": 3451461302.0, + "step": 6751 + }, + { + "epoch": 1.825851811790157, + "grad_norm": 6.40625, + "learning_rate": 0.015199625640883722, + "loss": 3.4635, + "mean_token_accuracy": 0.3164140284061432, + "num_tokens": 3451959022.0, + "step": 6752 + }, + { + "epoch": 1.8261222282314766, + "grad_norm": 2.34375, + "learning_rate": 0.015198231242350245, + "loss": 3.0914, + "mean_token_accuracy": 0.3836701512336731, + "num_tokens": 3452483221.0, + "step": 6753 + }, + { + "epoch": 1.826392644672796, + "grad_norm": 2.140625, + "learning_rate": 0.015196836715002687, + "loss": 3.1017, + "mean_token_accuracy": 0.41405877470970154, + "num_tokens": 3452947689.0, + "step": 6754 + }, + { + "epoch": 1.8266630611141157, + "grad_norm": 3.0625, + "learning_rate": 0.015195442058883837, + "loss": 3.2693, + "mean_token_accuracy": 0.36471027135849, + "num_tokens": 3453471744.0, + "step": 6755 + }, + { + "epoch": 1.8269334775554353, + "grad_norm": 3.140625, + "learning_rate": 0.015194047274036485, + "loss": 3.3869, + "mean_token_accuracy": 0.37595704197883606, + "num_tokens": 3453995955.0, + "step": 6756 + }, + { + "epoch": 1.827203893996755, + "grad_norm": 3.1875, + "learning_rate": 0.01519265236050343, + "loss": 3.1094, + "mean_token_accuracy": 0.3901016414165497, + "num_tokens": 3454470216.0, + "step": 6757 + }, + { + "epoch": 1.8274743104380746, + "grad_norm": 2.640625, + "learning_rate": 0.015191257318327472, + "loss": 3.3995, + "mean_token_accuracy": 0.3832263946533203, + "num_tokens": 3454994262.0, + "step": 6758 + }, + { + "epoch": 1.8277447268793943, + "grad_norm": 3.5, + "learning_rate": 0.015189862147551412, + "loss": 3.5058, + "mean_token_accuracy": 0.35685813426971436, + "num_tokens": 3455517743.0, + "step": 6759 + }, + { + "epoch": 1.828015143320714, + "grad_norm": 2.359375, + "learning_rate": 0.01518846684821806, + "loss": 3.1844, + "mean_token_accuracy": 0.38649624586105347, + "num_tokens": 3456041961.0, + "step": 6760 + }, + { + "epoch": 1.8282855597620336, + "grad_norm": 4.0, + "learning_rate": 0.015187071420370226, + "loss": 3.3455, + "mean_token_accuracy": 0.3758271336555481, + "num_tokens": 3456539253.0, + "step": 6761 + }, + { + "epoch": 1.8285559762033532, + "grad_norm": 2.609375, + "learning_rate": 0.015185675864050731, + "loss": 3.371, + "mean_token_accuracy": 0.3758964538574219, + "num_tokens": 3457063540.0, + "step": 6762 + }, + { + "epoch": 1.8288263926446728, + "grad_norm": 2.765625, + "learning_rate": 0.01518428017930239, + "loss": 2.9685, + "mean_token_accuracy": 0.3990400731563568, + "num_tokens": 3457587776.0, + "step": 6763 + }, + { + "epoch": 1.8290968090859925, + "grad_norm": 2.296875, + "learning_rate": 0.015182884366168025, + "loss": 3.1593, + "mean_token_accuracy": 0.3840702772140503, + "num_tokens": 3458111912.0, + "step": 6764 + }, + { + "epoch": 1.829367225527312, + "grad_norm": 2.609375, + "learning_rate": 0.01518148842469047, + "loss": 3.1754, + "mean_token_accuracy": 0.3917946219444275, + "num_tokens": 3458636127.0, + "step": 6765 + }, + { + "epoch": 1.8296376419686315, + "grad_norm": 2.359375, + "learning_rate": 0.015180092354912552, + "loss": 3.0511, + "mean_token_accuracy": 0.4040980637073517, + "num_tokens": 3459160290.0, + "step": 6766 + }, + { + "epoch": 1.8299080584099512, + "grad_norm": 2.46875, + "learning_rate": 0.015178696156877107, + "loss": 3.2435, + "mean_token_accuracy": 0.37155625224113464, + "num_tokens": 3459684273.0, + "step": 6767 + }, + { + "epoch": 1.8301784748512708, + "grad_norm": 2.359375, + "learning_rate": 0.015177299830626977, + "loss": 3.2146, + "mean_token_accuracy": 0.4296138286590576, + "num_tokens": 3460145166.0, + "step": 6768 + }, + { + "epoch": 1.8304488912925905, + "grad_norm": 2.4375, + "learning_rate": 0.015175903376205002, + "loss": 3.0749, + "mean_token_accuracy": 0.3910423219203949, + "num_tokens": 3460669447.0, + "step": 6769 + }, + { + "epoch": 1.8307193077339101, + "grad_norm": 2.703125, + "learning_rate": 0.015174506793654032, + "loss": 3.1173, + "mean_token_accuracy": 0.3977334201335907, + "num_tokens": 3461193575.0, + "step": 6770 + }, + { + "epoch": 1.8309897241752298, + "grad_norm": 208.0, + "learning_rate": 0.015173110083016918, + "loss": 15.3466, + "mean_token_accuracy": 0.0, + "num_tokens": 3461717826.0, + "step": 6771 + }, + { + "epoch": 1.8312601406165494, + "grad_norm": 7.875, + "learning_rate": 0.015171713244336511, + "loss": 3.4409, + "mean_token_accuracy": 0.3721840977668762, + "num_tokens": 3462242060.0, + "step": 6772 + }, + { + "epoch": 1.831530557057869, + "grad_norm": 3.21875, + "learning_rate": 0.015170316277655676, + "loss": 3.2705, + "mean_token_accuracy": 0.40056848526000977, + "num_tokens": 3462715374.0, + "step": 6773 + }, + { + "epoch": 1.8318009734991887, + "grad_norm": 4.09375, + "learning_rate": 0.015168919183017271, + "loss": 3.4795, + "mean_token_accuracy": 0.38607263565063477, + "num_tokens": 3463194965.0, + "step": 6774 + }, + { + "epoch": 1.8320713899405083, + "grad_norm": 2.859375, + "learning_rate": 0.015167521960464172, + "loss": 3.4595, + "mean_token_accuracy": 0.34385883808135986, + "num_tokens": 3463719234.0, + "step": 6775 + }, + { + "epoch": 1.832341806381828, + "grad_norm": 2.375, + "learning_rate": 0.015166124610039237, + "loss": 3.2145, + "mean_token_accuracy": 0.3955504298210144, + "num_tokens": 3464243352.0, + "step": 6776 + }, + { + "epoch": 1.8326122228231476, + "grad_norm": 2.828125, + "learning_rate": 0.015164727131785348, + "loss": 3.1233, + "mean_token_accuracy": 0.3936782479286194, + "num_tokens": 3464767582.0, + "step": 6777 + }, + { + "epoch": 1.8328826392644673, + "grad_norm": 2.3125, + "learning_rate": 0.015163329525745386, + "loss": 3.2629, + "mean_token_accuracy": 0.3868218660354614, + "num_tokens": 3465291785.0, + "step": 6778 + }, + { + "epoch": 1.833153055705787, + "grad_norm": 2.765625, + "learning_rate": 0.015161931791962228, + "loss": 3.3074, + "mean_token_accuracy": 0.38795942068099976, + "num_tokens": 3465758012.0, + "step": 6779 + }, + { + "epoch": 1.8334234721471065, + "grad_norm": 2.15625, + "learning_rate": 0.015160533930478764, + "loss": 3.3131, + "mean_token_accuracy": 0.3926783800125122, + "num_tokens": 3466230604.0, + "step": 6780 + }, + { + "epoch": 1.8336938885884262, + "grad_norm": 2.859375, + "learning_rate": 0.01515913594133788, + "loss": 3.2751, + "mean_token_accuracy": 0.3627701997756958, + "num_tokens": 3466754827.0, + "step": 6781 + }, + { + "epoch": 1.8339643050297458, + "grad_norm": 2.78125, + "learning_rate": 0.01515773782458248, + "loss": 3.2424, + "mean_token_accuracy": 0.3869560658931732, + "num_tokens": 3467279106.0, + "step": 6782 + }, + { + "epoch": 1.8342347214710655, + "grad_norm": 2.859375, + "learning_rate": 0.015156339580255454, + "loss": 3.0602, + "mean_token_accuracy": 0.40174904465675354, + "num_tokens": 3467803352.0, + "step": 6783 + }, + { + "epoch": 1.8345051379123851, + "grad_norm": 2.828125, + "learning_rate": 0.015154941208399707, + "loss": 3.3181, + "mean_token_accuracy": 0.38840681314468384, + "num_tokens": 3468327612.0, + "step": 6784 + }, + { + "epoch": 1.8347755543537048, + "grad_norm": 3.234375, + "learning_rate": 0.015153542709058143, + "loss": 3.2076, + "mean_token_accuracy": 0.3926740884780884, + "num_tokens": 3468829013.0, + "step": 6785 + }, + { + "epoch": 1.8350459707950244, + "grad_norm": 2.984375, + "learning_rate": 0.015152144082273675, + "loss": 3.138, + "mean_token_accuracy": 0.39837223291397095, + "num_tokens": 3469353154.0, + "step": 6786 + }, + { + "epoch": 1.835316387236344, + "grad_norm": 3.28125, + "learning_rate": 0.015150745328089217, + "loss": 3.1053, + "mean_token_accuracy": 0.38224953413009644, + "num_tokens": 3469877359.0, + "step": 6787 + }, + { + "epoch": 1.8355868036776637, + "grad_norm": 2.875, + "learning_rate": 0.015149346446547685, + "loss": 3.2338, + "mean_token_accuracy": 0.43105316162109375, + "num_tokens": 3470278188.0, + "step": 6788 + }, + { + "epoch": 1.8358572201189833, + "grad_norm": 3.09375, + "learning_rate": 0.015147947437692003, + "loss": 2.9472, + "mean_token_accuracy": 0.38742613792419434, + "num_tokens": 3470802466.0, + "step": 6789 + }, + { + "epoch": 1.836127636560303, + "grad_norm": 3.125, + "learning_rate": 0.015146548301565093, + "loss": 3.182, + "mean_token_accuracy": 0.3870556652545929, + "num_tokens": 3471326740.0, + "step": 6790 + }, + { + "epoch": 1.8363980530016226, + "grad_norm": 3.78125, + "learning_rate": 0.015145149038209887, + "loss": 11.9102, + "mean_token_accuracy": 1.367895856674295e-05, + "num_tokens": 3471850985.0, + "step": 6791 + }, + { + "epoch": 1.8366684694429423, + "grad_norm": 9.5, + "learning_rate": 0.01514374964766932, + "loss": 3.7451, + "mean_token_accuracy": 0.31175416707992554, + "num_tokens": 3472375263.0, + "step": 6792 + }, + { + "epoch": 1.836938885884262, + "grad_norm": 2.203125, + "learning_rate": 0.015142350129986326, + "loss": 3.4941, + "mean_token_accuracy": 0.3702029883861542, + "num_tokens": 3472899383.0, + "step": 6793 + }, + { + "epoch": 1.8372093023255816, + "grad_norm": 2.71875, + "learning_rate": 0.015140950485203851, + "loss": 3.3154, + "mean_token_accuracy": 0.3755033314228058, + "num_tokens": 3473399779.0, + "step": 6794 + }, + { + "epoch": 1.8374797187669012, + "grad_norm": 2.875, + "learning_rate": 0.015139550713364834, + "loss": 3.2521, + "mean_token_accuracy": 0.3845883011817932, + "num_tokens": 3473924032.0, + "step": 6795 + }, + { + "epoch": 1.8377501352082206, + "grad_norm": 3.09375, + "learning_rate": 0.01513815081451223, + "loss": 3.2588, + "mean_token_accuracy": 0.368480920791626, + "num_tokens": 3474448159.0, + "step": 6796 + }, + { + "epoch": 1.8380205516495403, + "grad_norm": 3.28125, + "learning_rate": 0.015136750788688988, + "loss": 3.4337, + "mean_token_accuracy": 0.39708828926086426, + "num_tokens": 3474911850.0, + "step": 6797 + }, + { + "epoch": 1.83829096809086, + "grad_norm": 3.84375, + "learning_rate": 0.015135350635938068, + "loss": 3.4198, + "mean_token_accuracy": 0.376888632774353, + "num_tokens": 3475408898.0, + "step": 6798 + }, + { + "epoch": 1.8385613845321795, + "grad_norm": 2.75, + "learning_rate": 0.015133950356302429, + "loss": 3.3758, + "mean_token_accuracy": 0.3842605948448181, + "num_tokens": 3475933027.0, + "step": 6799 + }, + { + "epoch": 1.8388318009734992, + "grad_norm": 3.53125, + "learning_rate": 0.015132549949825035, + "loss": 3.027, + "mean_token_accuracy": 0.3947230577468872, + "num_tokens": 3476457302.0, + "step": 6800 + }, + { + "epoch": 1.8391022174148188, + "grad_norm": 2.546875, + "learning_rate": 0.015131149416548854, + "loss": 3.3091, + "mean_token_accuracy": 0.3748359680175781, + "num_tokens": 3476973679.0, + "step": 6801 + }, + { + "epoch": 1.8393726338561385, + "grad_norm": 3.546875, + "learning_rate": 0.015129748756516861, + "loss": 3.2259, + "mean_token_accuracy": 0.3837076425552368, + "num_tokens": 3477497871.0, + "step": 6802 + }, + { + "epoch": 1.8396430502974581, + "grad_norm": 3.171875, + "learning_rate": 0.015128347969772032, + "loss": 3.2406, + "mean_token_accuracy": 0.38689249753952026, + "num_tokens": 3478022122.0, + "step": 6803 + }, + { + "epoch": 1.8399134667387778, + "grad_norm": 3.03125, + "learning_rate": 0.015126947056357348, + "loss": 3.4639, + "mean_token_accuracy": 0.3478957712650299, + "num_tokens": 3478546373.0, + "step": 6804 + }, + { + "epoch": 1.8401838831800974, + "grad_norm": 4.03125, + "learning_rate": 0.015125546016315789, + "loss": 3.2761, + "mean_token_accuracy": 0.37714093923568726, + "num_tokens": 3479070541.0, + "step": 6805 + }, + { + "epoch": 1.8404542996214168, + "grad_norm": 3.359375, + "learning_rate": 0.015124144849690348, + "loss": 3.0368, + "mean_token_accuracy": 0.39820414781570435, + "num_tokens": 3479594634.0, + "step": 6806 + }, + { + "epoch": 1.8407247160627365, + "grad_norm": 2.421875, + "learning_rate": 0.015122743556524015, + "loss": 3.2921, + "mean_token_accuracy": 0.39746391773223877, + "num_tokens": 3480066815.0, + "step": 6807 + }, + { + "epoch": 1.840995132504056, + "grad_norm": 2.921875, + "learning_rate": 0.015121342136859783, + "loss": 3.3181, + "mean_token_accuracy": 0.3721659183502197, + "num_tokens": 3480591087.0, + "step": 6808 + }, + { + "epoch": 1.8412655489453758, + "grad_norm": 3.140625, + "learning_rate": 0.015119940590740654, + "loss": 3.333, + "mean_token_accuracy": 0.3746735453605652, + "num_tokens": 3481115370.0, + "step": 6809 + }, + { + "epoch": 1.8415359653866954, + "grad_norm": 3.1875, + "learning_rate": 0.015118538918209634, + "loss": 3.4937, + "mean_token_accuracy": 0.376761257648468, + "num_tokens": 3481603000.0, + "step": 6810 + }, + { + "epoch": 1.841806381828015, + "grad_norm": 268.0, + "learning_rate": 0.015117137119309728, + "loss": 15.7883, + "mean_token_accuracy": 0.026790212839841843, + "num_tokens": 3482127075.0, + "step": 6811 + }, + { + "epoch": 1.8420767982693347, + "grad_norm": 5.3125, + "learning_rate": 0.015115735194083949, + "loss": 3.4798, + "mean_token_accuracy": 0.3542546033859253, + "num_tokens": 3482651294.0, + "step": 6812 + }, + { + "epoch": 1.8423472147106543, + "grad_norm": 2.03125, + "learning_rate": 0.015114333142575305, + "loss": 3.4331, + "mean_token_accuracy": 0.38254839181900024, + "num_tokens": 3483129285.0, + "step": 6813 + }, + { + "epoch": 1.842617631151974, + "grad_norm": 3.25, + "learning_rate": 0.015112930964826826, + "loss": 3.2703, + "mean_token_accuracy": 0.3742678165435791, + "num_tokens": 3483653462.0, + "step": 6814 + }, + { + "epoch": 1.8428880475932936, + "grad_norm": 3.90625, + "learning_rate": 0.015111528660881526, + "loss": 3.1372, + "mean_token_accuracy": 0.3906160593032837, + "num_tokens": 3484177741.0, + "step": 6815 + }, + { + "epoch": 1.8431584640346133, + "grad_norm": 3.171875, + "learning_rate": 0.015110126230782438, + "loss": 3.3192, + "mean_token_accuracy": 0.3569825291633606, + "num_tokens": 3484702020.0, + "step": 6816 + }, + { + "epoch": 1.843428880475933, + "grad_norm": 3.0625, + "learning_rate": 0.015108723674572588, + "loss": 3.4266, + "mean_token_accuracy": 0.3793826699256897, + "num_tokens": 3485226071.0, + "step": 6817 + }, + { + "epoch": 1.8436992969172525, + "grad_norm": 2.90625, + "learning_rate": 0.015107320992295016, + "loss": 3.3488, + "mean_token_accuracy": 0.37542760372161865, + "num_tokens": 3485750223.0, + "step": 6818 + }, + { + "epoch": 1.8439697133585722, + "grad_norm": 2.765625, + "learning_rate": 0.015105918183992754, + "loss": 3.1816, + "mean_token_accuracy": 0.40439778566360474, + "num_tokens": 3486252696.0, + "step": 6819 + }, + { + "epoch": 1.8442401297998918, + "grad_norm": 2.578125, + "learning_rate": 0.015104515249708848, + "loss": 3.1596, + "mean_token_accuracy": 0.3667880892753601, + "num_tokens": 3486776877.0, + "step": 6820 + }, + { + "epoch": 1.8445105462412115, + "grad_norm": 3.0625, + "learning_rate": 0.015103112189486345, + "loss": 3.1603, + "mean_token_accuracy": 0.41776740550994873, + "num_tokens": 3487226637.0, + "step": 6821 + }, + { + "epoch": 1.8447809626825311, + "grad_norm": 3.09375, + "learning_rate": 0.015101709003368291, + "loss": 3.3475, + "mean_token_accuracy": 0.3797411024570465, + "num_tokens": 3487750840.0, + "step": 6822 + }, + { + "epoch": 1.8450513791238508, + "grad_norm": 3.09375, + "learning_rate": 0.015100305691397744, + "loss": 3.3, + "mean_token_accuracy": 0.39726608991622925, + "num_tokens": 3488274917.0, + "step": 6823 + }, + { + "epoch": 1.8453217955651704, + "grad_norm": 2.953125, + "learning_rate": 0.015098902253617763, + "loss": 3.3507, + "mean_token_accuracy": 0.3736763000488281, + "num_tokens": 3488799123.0, + "step": 6824 + }, + { + "epoch": 1.84559221200649, + "grad_norm": 2.859375, + "learning_rate": 0.015097498690071404, + "loss": 3.1549, + "mean_token_accuracy": 0.3792576491832733, + "num_tokens": 3489212610.0, + "step": 6825 + }, + { + "epoch": 1.8458626284478097, + "grad_norm": 3.15625, + "learning_rate": 0.015096095000801736, + "loss": 3.1433, + "mean_token_accuracy": 0.3771492838859558, + "num_tokens": 3489736709.0, + "step": 6826 + }, + { + "epoch": 1.8461330448891293, + "grad_norm": 2.84375, + "learning_rate": 0.015094691185851826, + "loss": 3.3111, + "mean_token_accuracy": 0.3630896210670471, + "num_tokens": 3490257072.0, + "step": 6827 + }, + { + "epoch": 1.846403461330449, + "grad_norm": 2.96875, + "learning_rate": 0.015093287245264752, + "loss": 3.2913, + "mean_token_accuracy": 0.3772154450416565, + "num_tokens": 3490781252.0, + "step": 6828 + }, + { + "epoch": 1.8466738777717686, + "grad_norm": 3.203125, + "learning_rate": 0.015091883179083585, + "loss": 3.1293, + "mean_token_accuracy": 0.39924782514572144, + "num_tokens": 3491305527.0, + "step": 6829 + }, + { + "epoch": 1.8469442942130883, + "grad_norm": 3.859375, + "learning_rate": 0.015090478987351406, + "loss": 3.4686, + "mean_token_accuracy": 0.37290459871292114, + "num_tokens": 3491829705.0, + "step": 6830 + }, + { + "epoch": 1.847214710654408, + "grad_norm": 5.25, + "learning_rate": 0.015089074670111304, + "loss": 10.5593, + "mean_token_accuracy": 2.2176354832481593e-05, + "num_tokens": 3492268253.0, + "step": 6831 + }, + { + "epoch": 1.8474851270957275, + "grad_norm": 10.75, + "learning_rate": 0.015087670227406369, + "loss": 3.9559, + "mean_token_accuracy": 0.3153471052646637, + "num_tokens": 3492751135.0, + "step": 6832 + }, + { + "epoch": 1.8477555435370472, + "grad_norm": 2.265625, + "learning_rate": 0.015086265659279688, + "loss": 3.466, + "mean_token_accuracy": 0.36399075388908386, + "num_tokens": 3493220255.0, + "step": 6833 + }, + { + "epoch": 1.8480259599783668, + "grad_norm": 2.921875, + "learning_rate": 0.015084860965774364, + "loss": 3.447, + "mean_token_accuracy": 0.3587435781955719, + "num_tokens": 3493744507.0, + "step": 6834 + }, + { + "epoch": 1.8482963764196865, + "grad_norm": 3.625, + "learning_rate": 0.015083456146933488, + "loss": 3.4379, + "mean_token_accuracy": 0.37905341386795044, + "num_tokens": 3494268687.0, + "step": 6835 + }, + { + "epoch": 1.8485667928610061, + "grad_norm": 3.390625, + "learning_rate": 0.015082051202800166, + "loss": 3.236, + "mean_token_accuracy": 0.3789486885070801, + "num_tokens": 3494745094.0, + "step": 6836 + }, + { + "epoch": 1.8488372093023255, + "grad_norm": 2.890625, + "learning_rate": 0.015080646133417513, + "loss": 3.2974, + "mean_token_accuracy": 0.3804124593734741, + "num_tokens": 3495269316.0, + "step": 6837 + }, + { + "epoch": 1.8491076257436452, + "grad_norm": 3.25, + "learning_rate": 0.015079240938828635, + "loss": 3.2199, + "mean_token_accuracy": 0.3868614733219147, + "num_tokens": 3495793566.0, + "step": 6838 + }, + { + "epoch": 1.8493780421849648, + "grad_norm": 3.328125, + "learning_rate": 0.01507783561907665, + "loss": 3.4433, + "mean_token_accuracy": 0.3376308083534241, + "num_tokens": 3496317714.0, + "step": 6839 + }, + { + "epoch": 1.8496484586262845, + "grad_norm": 2.9375, + "learning_rate": 0.015076430174204677, + "loss": 3.0796, + "mean_token_accuracy": 0.3903254270553589, + "num_tokens": 3496841922.0, + "step": 6840 + }, + { + "epoch": 1.849918875067604, + "grad_norm": 2.90625, + "learning_rate": 0.015075024604255836, + "loss": 3.1382, + "mean_token_accuracy": 0.385764479637146, + "num_tokens": 3497366130.0, + "step": 6841 + }, + { + "epoch": 1.8501892915089238, + "grad_norm": 2.875, + "learning_rate": 0.015073618909273256, + "loss": 3.145, + "mean_token_accuracy": 0.3849780857563019, + "num_tokens": 3497879013.0, + "step": 6842 + }, + { + "epoch": 1.8504597079502434, + "grad_norm": 2.5625, + "learning_rate": 0.01507221308930007, + "loss": 3.1878, + "mean_token_accuracy": 0.40814000368118286, + "num_tokens": 3498403140.0, + "step": 6843 + }, + { + "epoch": 1.850730124391563, + "grad_norm": 2.453125, + "learning_rate": 0.01507080714437941, + "loss": 3.1349, + "mean_token_accuracy": 0.3636619746685028, + "num_tokens": 3498927280.0, + "step": 6844 + }, + { + "epoch": 1.8510005408328827, + "grad_norm": 2.390625, + "learning_rate": 0.015069401074554414, + "loss": 3.0094, + "mean_token_accuracy": 0.4150221049785614, + "num_tokens": 3499392942.0, + "step": 6845 + }, + { + "epoch": 1.8512709572742023, + "grad_norm": 2.5, + "learning_rate": 0.015067994879868228, + "loss": 3.2701, + "mean_token_accuracy": 0.4008830487728119, + "num_tokens": 3499877019.0, + "step": 6846 + }, + { + "epoch": 1.8515413737155217, + "grad_norm": 2.984375, + "learning_rate": 0.015066588560363997, + "loss": 3.101, + "mean_token_accuracy": 0.3967618942260742, + "num_tokens": 3500401236.0, + "step": 6847 + }, + { + "epoch": 1.8518117901568414, + "grad_norm": 3.578125, + "learning_rate": 0.015065182116084866, + "loss": 2.9715, + "mean_token_accuracy": 0.437191903591156, + "num_tokens": 3500925520.0, + "step": 6848 + }, + { + "epoch": 1.852082206598161, + "grad_norm": 3.375, + "learning_rate": 0.015063775547073995, + "loss": 3.1225, + "mean_token_accuracy": 0.40004464983940125, + "num_tokens": 3501449627.0, + "step": 6849 + }, + { + "epoch": 1.8523526230394807, + "grad_norm": 3.53125, + "learning_rate": 0.015062368853374537, + "loss": 3.34, + "mean_token_accuracy": 0.3806247413158417, + "num_tokens": 3501973908.0, + "step": 6850 + }, + { + "epoch": 1.8526230394808003, + "grad_norm": 119.0, + "learning_rate": 0.015060962035029659, + "loss": 24.4912, + "mean_token_accuracy": 0.03723473474383354, + "num_tokens": 3502498174.0, + "step": 6851 + }, + { + "epoch": 1.85289345592212, + "grad_norm": 5.28125, + "learning_rate": 0.015059555092082523, + "loss": 3.5162, + "mean_token_accuracy": 0.3658715486526489, + "num_tokens": 3503022310.0, + "step": 6852 + }, + { + "epoch": 1.8531638723634396, + "grad_norm": 2.078125, + "learning_rate": 0.015058148024576294, + "loss": 3.4597, + "mean_token_accuracy": 0.38362497091293335, + "num_tokens": 3503546284.0, + "step": 6853 + }, + { + "epoch": 1.8534342888047592, + "grad_norm": 2.484375, + "learning_rate": 0.015056740832554157, + "loss": 3.1689, + "mean_token_accuracy": 0.368633896112442, + "num_tokens": 3504070532.0, + "step": 6854 + }, + { + "epoch": 1.8537047052460789, + "grad_norm": 7.625, + "learning_rate": 0.015055333516059278, + "loss": 3.2175, + "mean_token_accuracy": 0.3937680721282959, + "num_tokens": 3504574015.0, + "step": 6855 + }, + { + "epoch": 1.8539751216873985, + "grad_norm": 1.9140625, + "learning_rate": 0.015053926075134836, + "loss": 3.2401, + "mean_token_accuracy": 0.38648727536201477, + "num_tokens": 3505098251.0, + "step": 6856 + }, + { + "epoch": 1.8542455381287182, + "grad_norm": 3.34375, + "learning_rate": 0.015052518509824024, + "loss": 3.2718, + "mean_token_accuracy": 0.3824115991592407, + "num_tokens": 3505622482.0, + "step": 6857 + }, + { + "epoch": 1.8545159545700378, + "grad_norm": 3.640625, + "learning_rate": 0.015051110820170027, + "loss": 3.3031, + "mean_token_accuracy": 0.4039512276649475, + "num_tokens": 3506086357.0, + "step": 6858 + }, + { + "epoch": 1.8547863710113575, + "grad_norm": 2.578125, + "learning_rate": 0.015049703006216036, + "loss": 3.2089, + "mean_token_accuracy": 0.39572852849960327, + "num_tokens": 3506610579.0, + "step": 6859 + }, + { + "epoch": 1.855056787452677, + "grad_norm": 3.578125, + "learning_rate": 0.015048295068005247, + "loss": 3.3844, + "mean_token_accuracy": 0.3775380253791809, + "num_tokens": 3507134738.0, + "step": 6860 + }, + { + "epoch": 1.8553272038939967, + "grad_norm": 3.125, + "learning_rate": 0.01504688700558086, + "loss": 3.0946, + "mean_token_accuracy": 0.3969147503376007, + "num_tokens": 3507658983.0, + "step": 6861 + }, + { + "epoch": 1.8555976203353164, + "grad_norm": 3.078125, + "learning_rate": 0.01504547881898608, + "loss": 3.4116, + "mean_token_accuracy": 0.3606654405593872, + "num_tokens": 3508183249.0, + "step": 6862 + }, + { + "epoch": 1.855868036776636, + "grad_norm": 3.953125, + "learning_rate": 0.015044070508264111, + "loss": 3.2599, + "mean_token_accuracy": 0.3902856707572937, + "num_tokens": 3508704117.0, + "step": 6863 + }, + { + "epoch": 1.8561384532179557, + "grad_norm": 3.859375, + "learning_rate": 0.015042662073458168, + "loss": 3.1841, + "mean_token_accuracy": 0.3849963843822479, + "num_tokens": 3509170450.0, + "step": 6864 + }, + { + "epoch": 1.8564088696592753, + "grad_norm": 2.828125, + "learning_rate": 0.015041253514611462, + "loss": 3.3171, + "mean_token_accuracy": 0.3822106122970581, + "num_tokens": 3509694626.0, + "step": 6865 + }, + { + "epoch": 1.856679286100595, + "grad_norm": 2.953125, + "learning_rate": 0.015039844831767213, + "loss": 3.1891, + "mean_token_accuracy": 0.3607241213321686, + "num_tokens": 3510218791.0, + "step": 6866 + }, + { + "epoch": 1.8569497025419146, + "grad_norm": 2.328125, + "learning_rate": 0.015038436024968647, + "loss": 3.3635, + "mean_token_accuracy": 0.37568753957748413, + "num_tokens": 3510742955.0, + "step": 6867 + }, + { + "epoch": 1.8572201189832342, + "grad_norm": 3.921875, + "learning_rate": 0.015037027094258986, + "loss": 3.308, + "mean_token_accuracy": 0.3974396586418152, + "num_tokens": 3511220489.0, + "step": 6868 + }, + { + "epoch": 1.857490535424554, + "grad_norm": 2.609375, + "learning_rate": 0.01503561803968146, + "loss": 3.3514, + "mean_token_accuracy": 0.37074682116508484, + "num_tokens": 3511744744.0, + "step": 6869 + }, + { + "epoch": 1.8577609518658735, + "grad_norm": 3.046875, + "learning_rate": 0.015034208861279309, + "loss": 3.3895, + "mean_token_accuracy": 0.3628581464290619, + "num_tokens": 3512269020.0, + "step": 6870 + }, + { + "epoch": 1.8580313683071932, + "grad_norm": 15.9375, + "learning_rate": 0.015032799559095762, + "loss": 18.7533, + "mean_token_accuracy": 1.972293148355675e-06, + "num_tokens": 3512793202.0, + "step": 6871 + }, + { + "epoch": 1.8583017847485128, + "grad_norm": 7.71875, + "learning_rate": 0.015031390133174062, + "loss": 3.3611, + "mean_token_accuracy": 0.3656531870365143, + "num_tokens": 3513272878.0, + "step": 6872 + }, + { + "epoch": 1.8585722011898325, + "grad_norm": 2.953125, + "learning_rate": 0.015029980583557466, + "loss": 3.3735, + "mean_token_accuracy": 0.3935783803462982, + "num_tokens": 3513743510.0, + "step": 6873 + }, + { + "epoch": 1.858842617631152, + "grad_norm": 2.46875, + "learning_rate": 0.01502857091028921, + "loss": 3.3853, + "mean_token_accuracy": 0.3743131458759308, + "num_tokens": 3514212942.0, + "step": 6874 + }, + { + "epoch": 1.8591130340724717, + "grad_norm": 2.765625, + "learning_rate": 0.015027161113412551, + "loss": 3.3266, + "mean_token_accuracy": 0.39939457178115845, + "num_tokens": 3514721724.0, + "step": 6875 + }, + { + "epoch": 1.8593834505137914, + "grad_norm": 2.78125, + "learning_rate": 0.015025751192970744, + "loss": 3.3844, + "mean_token_accuracy": 0.36793532967567444, + "num_tokens": 3515245942.0, + "step": 6876 + }, + { + "epoch": 1.859653866955111, + "grad_norm": 14.375, + "learning_rate": 0.015024341149007055, + "loss": 2.8136, + "mean_token_accuracy": 0.44213271141052246, + "num_tokens": 3515727477.0, + "step": 6877 + }, + { + "epoch": 1.8599242833964305, + "grad_norm": 3.265625, + "learning_rate": 0.015022930981564741, + "loss": 3.4631, + "mean_token_accuracy": 0.3887327313423157, + "num_tokens": 3516251737.0, + "step": 6878 + }, + { + "epoch": 1.86019469983775, + "grad_norm": 2.25, + "learning_rate": 0.015021520690687075, + "loss": 3.2733, + "mean_token_accuracy": 0.38729149103164673, + "num_tokens": 3516775832.0, + "step": 6879 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 3.5625, + "learning_rate": 0.015020110276417331, + "loss": 3.2818, + "mean_token_accuracy": 0.3669670522212982, + "num_tokens": 3517300095.0, + "step": 6880 + }, + { + "epoch": 1.8607355327203894, + "grad_norm": 2.484375, + "learning_rate": 0.015018699738798777, + "loss": 3.0326, + "mean_token_accuracy": 0.39635878801345825, + "num_tokens": 3517824356.0, + "step": 6881 + }, + { + "epoch": 1.861005949161709, + "grad_norm": 2.265625, + "learning_rate": 0.015017289077874698, + "loss": 3.2948, + "mean_token_accuracy": 0.3919150233268738, + "num_tokens": 3518300984.0, + "step": 6882 + }, + { + "epoch": 1.8612763656030287, + "grad_norm": 3.265625, + "learning_rate": 0.015015878293688379, + "loss": 3.2179, + "mean_token_accuracy": 0.369254469871521, + "num_tokens": 3518825077.0, + "step": 6883 + }, + { + "epoch": 1.8615467820443483, + "grad_norm": 2.859375, + "learning_rate": 0.015014467386283101, + "loss": 3.2873, + "mean_token_accuracy": 0.38304606080055237, + "num_tokens": 3519349277.0, + "step": 6884 + }, + { + "epoch": 1.861817198485668, + "grad_norm": 2.796875, + "learning_rate": 0.015013056355702157, + "loss": 3.131, + "mean_token_accuracy": 0.3909258246421814, + "num_tokens": 3519873558.0, + "step": 6885 + }, + { + "epoch": 1.8620876149269876, + "grad_norm": 2.171875, + "learning_rate": 0.015011645201988846, + "loss": 3.2415, + "mean_token_accuracy": 0.3824400305747986, + "num_tokens": 3520397754.0, + "step": 6886 + }, + { + "epoch": 1.8623580313683072, + "grad_norm": 2.359375, + "learning_rate": 0.015010233925186458, + "loss": 3.2418, + "mean_token_accuracy": 0.38928472995758057, + "num_tokens": 3520911284.0, + "step": 6887 + }, + { + "epoch": 1.8626284478096267, + "grad_norm": 2.53125, + "learning_rate": 0.015008822525338304, + "loss": 3.1379, + "mean_token_accuracy": 0.38603028655052185, + "num_tokens": 3521435491.0, + "step": 6888 + }, + { + "epoch": 1.8628988642509463, + "grad_norm": 3.359375, + "learning_rate": 0.015007411002487684, + "loss": 3.1726, + "mean_token_accuracy": 0.3713245987892151, + "num_tokens": 3521959624.0, + "step": 6889 + }, + { + "epoch": 1.863169280692266, + "grad_norm": 2.546875, + "learning_rate": 0.01500599935667791, + "loss": 3.3046, + "mean_token_accuracy": 0.41744935512542725, + "num_tokens": 3522421939.0, + "step": 6890 + }, + { + "epoch": 1.8634396971335856, + "grad_norm": 3.765625, + "learning_rate": 0.015004587587952294, + "loss": 9.3252, + "mean_token_accuracy": 0.006552963517606258, + "num_tokens": 3522895154.0, + "step": 6891 + }, + { + "epoch": 1.8637101135749052, + "grad_norm": 9.5, + "learning_rate": 0.015003175696354154, + "loss": 4.0064, + "mean_token_accuracy": 0.2988991141319275, + "num_tokens": 3523419437.0, + "step": 6892 + }, + { + "epoch": 1.8639805300162249, + "grad_norm": 2.78125, + "learning_rate": 0.015001763681926811, + "loss": 3.3955, + "mean_token_accuracy": 0.3694142699241638, + "num_tokens": 3523943685.0, + "step": 6893 + }, + { + "epoch": 1.8642509464575445, + "grad_norm": 2.5625, + "learning_rate": 0.01500035154471359, + "loss": 3.3346, + "mean_token_accuracy": 0.3869416117668152, + "num_tokens": 3524467904.0, + "step": 6894 + }, + { + "epoch": 1.8645213628988642, + "grad_norm": 3.65625, + "learning_rate": 0.014998939284757823, + "loss": 3.2149, + "mean_token_accuracy": 0.3920387327671051, + "num_tokens": 3524981667.0, + "step": 6895 + }, + { + "epoch": 1.8647917793401838, + "grad_norm": 3.15625, + "learning_rate": 0.014997526902102834, + "loss": 3.1174, + "mean_token_accuracy": 0.3544269800186157, + "num_tokens": 3525505686.0, + "step": 6896 + }, + { + "epoch": 1.8650621957815035, + "grad_norm": 3.359375, + "learning_rate": 0.014996114396791965, + "loss": 3.4429, + "mean_token_accuracy": 0.36986255645751953, + "num_tokens": 3526029743.0, + "step": 6897 + }, + { + "epoch": 1.865332612222823, + "grad_norm": 3.671875, + "learning_rate": 0.014994701768868555, + "loss": 3.2359, + "mean_token_accuracy": 0.382464736700058, + "num_tokens": 3526554013.0, + "step": 6898 + }, + { + "epoch": 1.8656030286641427, + "grad_norm": 2.671875, + "learning_rate": 0.014993289018375947, + "loss": 3.0883, + "mean_token_accuracy": 0.38272613286972046, + "num_tokens": 3527041069.0, + "step": 6899 + }, + { + "epoch": 1.8658734451054624, + "grad_norm": 2.71875, + "learning_rate": 0.014991876145357488, + "loss": 2.9921, + "mean_token_accuracy": 0.40233591198921204, + "num_tokens": 3527515871.0, + "step": 6900 + }, + { + "epoch": 1.866143861546782, + "grad_norm": 2.609375, + "learning_rate": 0.014990463149856527, + "loss": 3.343, + "mean_token_accuracy": 0.3840065002441406, + "num_tokens": 3528039943.0, + "step": 6901 + }, + { + "epoch": 1.8664142779881017, + "grad_norm": 3.078125, + "learning_rate": 0.014989050031916425, + "loss": 3.1334, + "mean_token_accuracy": 0.39273905754089355, + "num_tokens": 3528564159.0, + "step": 6902 + }, + { + "epoch": 1.8666846944294213, + "grad_norm": 5.5625, + "learning_rate": 0.014987636791580536, + "loss": 3.092, + "mean_token_accuracy": 0.4416964650154114, + "num_tokens": 3529022949.0, + "step": 6903 + }, + { + "epoch": 1.866955110870741, + "grad_norm": 2.5625, + "learning_rate": 0.014986223428892222, + "loss": 2.7656, + "mean_token_accuracy": 0.4173022508621216, + "num_tokens": 3529547218.0, + "step": 6904 + }, + { + "epoch": 1.8672255273120606, + "grad_norm": 3.375, + "learning_rate": 0.014984809943894855, + "loss": 3.4121, + "mean_token_accuracy": 0.3765793442726135, + "num_tokens": 3530016955.0, + "step": 6905 + }, + { + "epoch": 1.8674959437533802, + "grad_norm": 2.6875, + "learning_rate": 0.014983396336631797, + "loss": 3.3218, + "mean_token_accuracy": 0.39357301592826843, + "num_tokens": 3530504649.0, + "step": 6906 + }, + { + "epoch": 1.8677663601946999, + "grad_norm": 3.65625, + "learning_rate": 0.014981982607146423, + "loss": 3.3644, + "mean_token_accuracy": 0.3938625454902649, + "num_tokens": 3530930707.0, + "step": 6907 + }, + { + "epoch": 1.8680367766360195, + "grad_norm": 2.84375, + "learning_rate": 0.014980568755482114, + "loss": 3.3974, + "mean_token_accuracy": 0.39286357164382935, + "num_tokens": 3531378191.0, + "step": 6908 + }, + { + "epoch": 1.8683071930773392, + "grad_norm": 3.203125, + "learning_rate": 0.01497915478168225, + "loss": 3.5467, + "mean_token_accuracy": 0.35896775126457214, + "num_tokens": 3531889855.0, + "step": 6909 + }, + { + "epoch": 1.8685776095186588, + "grad_norm": 3.046875, + "learning_rate": 0.014977740685790218, + "loss": 3.3068, + "mean_token_accuracy": 0.4120897650718689, + "num_tokens": 3532387614.0, + "step": 6910 + }, + { + "epoch": 1.8688480259599785, + "grad_norm": 5.53125, + "learning_rate": 0.0149763264678494, + "loss": 9.1494, + "mean_token_accuracy": 0.04009951651096344, + "num_tokens": 3532890898.0, + "step": 6911 + }, + { + "epoch": 1.869118442401298, + "grad_norm": 9.6875, + "learning_rate": 0.014974912127903193, + "loss": 3.7298, + "mean_token_accuracy": 0.3225694000720978, + "num_tokens": 3533415123.0, + "step": 6912 + }, + { + "epoch": 1.8693888588426177, + "grad_norm": 2.75, + "learning_rate": 0.014973497665994996, + "loss": 3.4415, + "mean_token_accuracy": 0.3687375485897064, + "num_tokens": 3533893587.0, + "step": 6913 + }, + { + "epoch": 1.8696592752839374, + "grad_norm": 2.78125, + "learning_rate": 0.0149720830821682, + "loss": 3.2427, + "mean_token_accuracy": 0.3813996911048889, + "num_tokens": 3534417728.0, + "step": 6914 + }, + { + "epoch": 1.869929691725257, + "grad_norm": 3.703125, + "learning_rate": 0.014970668376466214, + "loss": 3.3786, + "mean_token_accuracy": 0.3613004684448242, + "num_tokens": 3534941998.0, + "step": 6915 + }, + { + "epoch": 1.8702001081665767, + "grad_norm": 3.453125, + "learning_rate": 0.014969253548932452, + "loss": 3.4595, + "mean_token_accuracy": 0.36995577812194824, + "num_tokens": 3535466278.0, + "step": 6916 + }, + { + "epoch": 1.8704705246078963, + "grad_norm": 2.546875, + "learning_rate": 0.01496783859961031, + "loss": 3.4746, + "mean_token_accuracy": 0.3849971294403076, + "num_tokens": 3535955478.0, + "step": 6917 + }, + { + "epoch": 1.870740941049216, + "grad_norm": 2.71875, + "learning_rate": 0.014966423528543216, + "loss": 3.1492, + "mean_token_accuracy": 0.4100482761859894, + "num_tokens": 3536459021.0, + "step": 6918 + }, + { + "epoch": 1.8710113574905354, + "grad_norm": 2.609375, + "learning_rate": 0.014965008335774584, + "loss": 3.2981, + "mean_token_accuracy": 0.38983842730522156, + "num_tokens": 3536966391.0, + "step": 6919 + }, + { + "epoch": 1.871281773931855, + "grad_norm": 3.375, + "learning_rate": 0.014963593021347834, + "loss": 3.2115, + "mean_token_accuracy": 0.3726135790348053, + "num_tokens": 3537490635.0, + "step": 6920 + }, + { + "epoch": 1.8715521903731747, + "grad_norm": 2.34375, + "learning_rate": 0.014962177585306395, + "loss": 3.0523, + "mean_token_accuracy": 0.3956621587276459, + "num_tokens": 3538014671.0, + "step": 6921 + }, + { + "epoch": 1.8718226068144943, + "grad_norm": 2.703125, + "learning_rate": 0.014960762027693694, + "loss": 3.1012, + "mean_token_accuracy": 0.40419071912765503, + "num_tokens": 3538538825.0, + "step": 6922 + }, + { + "epoch": 1.872093023255814, + "grad_norm": 2.390625, + "learning_rate": 0.014959346348553167, + "loss": 3.2037, + "mean_token_accuracy": 0.384416401386261, + "num_tokens": 3539062954.0, + "step": 6923 + }, + { + "epoch": 1.8723634396971336, + "grad_norm": 2.78125, + "learning_rate": 0.014957930547928251, + "loss": 3.134, + "mean_token_accuracy": 0.4065304398536682, + "num_tokens": 3539535900.0, + "step": 6924 + }, + { + "epoch": 1.8726338561384532, + "grad_norm": 2.8125, + "learning_rate": 0.014956514625862385, + "loss": 3.2344, + "mean_token_accuracy": 0.4278515577316284, + "num_tokens": 3539995339.0, + "step": 6925 + }, + { + "epoch": 1.8729042725797729, + "grad_norm": 2.859375, + "learning_rate": 0.014955098582399018, + "loss": 3.1602, + "mean_token_accuracy": 0.36391007900238037, + "num_tokens": 3540519441.0, + "step": 6926 + }, + { + "epoch": 1.8731746890210925, + "grad_norm": 2.453125, + "learning_rate": 0.014953682417581594, + "loss": 3.1464, + "mean_token_accuracy": 0.40732085704803467, + "num_tokens": 3541043671.0, + "step": 6927 + }, + { + "epoch": 1.8734451054624122, + "grad_norm": 3.109375, + "learning_rate": 0.014952266131453564, + "loss": 3.1548, + "mean_token_accuracy": 0.3687121272087097, + "num_tokens": 3541567950.0, + "step": 6928 + }, + { + "epoch": 1.8737155219037316, + "grad_norm": 2.65625, + "learning_rate": 0.014950849724058388, + "loss": 3.3146, + "mean_token_accuracy": 0.3906676173210144, + "num_tokens": 3542046700.0, + "step": 6929 + }, + { + "epoch": 1.8739859383450512, + "grad_norm": 2.8125, + "learning_rate": 0.014949433195439524, + "loss": 3.1987, + "mean_token_accuracy": 0.39768078923225403, + "num_tokens": 3542570986.0, + "step": 6930 + }, + { + "epoch": 1.8742563547863709, + "grad_norm": 79.0, + "learning_rate": 0.014948016545640432, + "loss": 28.501, + "mean_token_accuracy": 0.03578516095876694, + "num_tokens": 3543064898.0, + "step": 6931 + }, + { + "epoch": 1.8745267712276905, + "grad_norm": 8.125, + "learning_rate": 0.014946599774704585, + "loss": 3.7078, + "mean_token_accuracy": 0.3399399518966675, + "num_tokens": 3543589113.0, + "step": 6932 + }, + { + "epoch": 1.8747971876690102, + "grad_norm": 2.8125, + "learning_rate": 0.014945182882675451, + "loss": 3.3774, + "mean_token_accuracy": 0.40101227164268494, + "num_tokens": 3544051552.0, + "step": 6933 + }, + { + "epoch": 1.8750676041103298, + "grad_norm": 3.640625, + "learning_rate": 0.014943765869596505, + "loss": 3.3515, + "mean_token_accuracy": 0.3678894639015198, + "num_tokens": 3544575670.0, + "step": 6934 + }, + { + "epoch": 1.8753380205516494, + "grad_norm": 3.015625, + "learning_rate": 0.014942348735511223, + "loss": 3.3888, + "mean_token_accuracy": 0.3723140358924866, + "num_tokens": 3545099771.0, + "step": 6935 + }, + { + "epoch": 1.875608436992969, + "grad_norm": 2.96875, + "learning_rate": 0.014940931480463087, + "loss": 3.1233, + "mean_token_accuracy": 0.3862672746181488, + "num_tokens": 3545623994.0, + "step": 6936 + }, + { + "epoch": 1.8758788534342887, + "grad_norm": 3.171875, + "learning_rate": 0.014939514104495583, + "loss": 3.3803, + "mean_token_accuracy": 0.35419559478759766, + "num_tokens": 3546148207.0, + "step": 6937 + }, + { + "epoch": 1.8761492698756084, + "grad_norm": 3.265625, + "learning_rate": 0.014938096607652203, + "loss": 3.2799, + "mean_token_accuracy": 0.3569965958595276, + "num_tokens": 3546672473.0, + "step": 6938 + }, + { + "epoch": 1.876419686316928, + "grad_norm": 2.546875, + "learning_rate": 0.014936678989976438, + "loss": 3.0845, + "mean_token_accuracy": 0.4104769229888916, + "num_tokens": 3547196746.0, + "step": 6939 + }, + { + "epoch": 1.8766901027582477, + "grad_norm": 3.203125, + "learning_rate": 0.014935261251511785, + "loss": 3.3229, + "mean_token_accuracy": 0.3738747835159302, + "num_tokens": 3547720976.0, + "step": 6940 + }, + { + "epoch": 1.8769605191995673, + "grad_norm": 2.765625, + "learning_rate": 0.014933843392301742, + "loss": 3.1408, + "mean_token_accuracy": 0.38202762603759766, + "num_tokens": 3548245220.0, + "step": 6941 + }, + { + "epoch": 1.877230935640887, + "grad_norm": 3.0625, + "learning_rate": 0.014932425412389816, + "loss": 3.344, + "mean_token_accuracy": 0.3884285092353821, + "num_tokens": 3548717112.0, + "step": 6942 + }, + { + "epoch": 1.8775013520822066, + "grad_norm": 2.984375, + "learning_rate": 0.014931007311819513, + "loss": 3.2646, + "mean_token_accuracy": 0.384748637676239, + "num_tokens": 3549241306.0, + "step": 6943 + }, + { + "epoch": 1.8777717685235262, + "grad_norm": 2.734375, + "learning_rate": 0.014929589090634346, + "loss": 3.1924, + "mean_token_accuracy": 0.3803926706314087, + "num_tokens": 3549765475.0, + "step": 6944 + }, + { + "epoch": 1.8780421849648459, + "grad_norm": 2.921875, + "learning_rate": 0.014928170748877829, + "loss": 3.1854, + "mean_token_accuracy": 0.3911903500556946, + "num_tokens": 3550289667.0, + "step": 6945 + }, + { + "epoch": 1.8783126014061655, + "grad_norm": 2.609375, + "learning_rate": 0.014926752286593483, + "loss": 3.199, + "mean_token_accuracy": 0.38842645287513733, + "num_tokens": 3550813919.0, + "step": 6946 + }, + { + "epoch": 1.8785830178474852, + "grad_norm": 2.828125, + "learning_rate": 0.014925333703824829, + "loss": 3.2971, + "mean_token_accuracy": 0.3751835227012634, + "num_tokens": 3551338119.0, + "step": 6947 + }, + { + "epoch": 1.8788534342888048, + "grad_norm": 2.5, + "learning_rate": 0.014923915000615393, + "loss": 3.1852, + "mean_token_accuracy": 0.35845738649368286, + "num_tokens": 3551862260.0, + "step": 6948 + }, + { + "epoch": 1.8791238507301244, + "grad_norm": 2.453125, + "learning_rate": 0.014922496177008705, + "loss": 3.1913, + "mean_token_accuracy": 0.40113499760627747, + "num_tokens": 3552339287.0, + "step": 6949 + }, + { + "epoch": 1.879394267171444, + "grad_norm": 2.78125, + "learning_rate": 0.014921077233048297, + "loss": 3.2805, + "mean_token_accuracy": 0.3890534043312073, + "num_tokens": 3552830322.0, + "step": 6950 + }, + { + "epoch": 1.8796646836127637, + "grad_norm": 160.0, + "learning_rate": 0.01491965816877771, + "loss": 11.1294, + "mean_token_accuracy": 0.025670364499092102, + "num_tokens": 3553354443.0, + "step": 6951 + }, + { + "epoch": 1.8799351000540834, + "grad_norm": 7.28125, + "learning_rate": 0.014918238984240485, + "loss": 3.8519, + "mean_token_accuracy": 0.3147132396697998, + "num_tokens": 3553878584.0, + "step": 6952 + }, + { + "epoch": 1.880205516495403, + "grad_norm": 2.078125, + "learning_rate": 0.014916819679480163, + "loss": 3.2312, + "mean_token_accuracy": 0.3939012885093689, + "num_tokens": 3554402609.0, + "step": 6953 + }, + { + "epoch": 1.8804759329367227, + "grad_norm": 3.21875, + "learning_rate": 0.014915400254540294, + "loss": 3.1711, + "mean_token_accuracy": 0.3843874931335449, + "num_tokens": 3554926770.0, + "step": 6954 + }, + { + "epoch": 1.8807463493780423, + "grad_norm": 3.203125, + "learning_rate": 0.014913980709464434, + "loss": 3.1064, + "mean_token_accuracy": 0.38350626826286316, + "num_tokens": 3555450905.0, + "step": 6955 + }, + { + "epoch": 1.881016765819362, + "grad_norm": 3.15625, + "learning_rate": 0.014912561044296134, + "loss": 3.2091, + "mean_token_accuracy": 0.36324769258499146, + "num_tokens": 3555975111.0, + "step": 6956 + }, + { + "epoch": 1.8812871822606816, + "grad_norm": 2.140625, + "learning_rate": 0.014911141259078954, + "loss": 2.7992, + "mean_token_accuracy": 0.41076818108558655, + "num_tokens": 3556499293.0, + "step": 6957 + }, + { + "epoch": 1.8815575987020012, + "grad_norm": 2.328125, + "learning_rate": 0.014909721353856459, + "loss": 3.3283, + "mean_token_accuracy": 0.42597201466560364, + "num_tokens": 3557023343.0, + "step": 6958 + }, + { + "epoch": 1.8818280151433209, + "grad_norm": 2.96875, + "learning_rate": 0.014908301328672213, + "loss": 3.4328, + "mean_token_accuracy": 0.37874510884284973, + "num_tokens": 3557547440.0, + "step": 6959 + }, + { + "epoch": 1.8820984315846403, + "grad_norm": 3.578125, + "learning_rate": 0.01490688118356979, + "loss": 3.416, + "mean_token_accuracy": 0.3820837140083313, + "num_tokens": 3558071663.0, + "step": 6960 + }, + { + "epoch": 1.88236884802596, + "grad_norm": 3.28125, + "learning_rate": 0.014905460918592763, + "loss": 3.2281, + "mean_token_accuracy": 0.38471078872680664, + "num_tokens": 3558595948.0, + "step": 6961 + }, + { + "epoch": 1.8826392644672796, + "grad_norm": 3.546875, + "learning_rate": 0.014904040533784713, + "loss": 3.2247, + "mean_token_accuracy": 0.401975154876709, + "num_tokens": 3559120175.0, + "step": 6962 + }, + { + "epoch": 1.8829096809085992, + "grad_norm": 2.875, + "learning_rate": 0.014902620029189215, + "loss": 3.1391, + "mean_token_accuracy": 0.3780367076396942, + "num_tokens": 3559644294.0, + "step": 6963 + }, + { + "epoch": 1.8831800973499189, + "grad_norm": 3.140625, + "learning_rate": 0.014901199404849859, + "loss": 3.2664, + "mean_token_accuracy": 0.3746712803840637, + "num_tokens": 3560168456.0, + "step": 6964 + }, + { + "epoch": 1.8834505137912385, + "grad_norm": 2.65625, + "learning_rate": 0.01489977866081023, + "loss": 3.1956, + "mean_token_accuracy": 0.37523359060287476, + "num_tokens": 3560692658.0, + "step": 6965 + }, + { + "epoch": 1.8837209302325582, + "grad_norm": 2.515625, + "learning_rate": 0.014898357797113926, + "loss": 3.272, + "mean_token_accuracy": 0.3900148868560791, + "num_tokens": 3561216849.0, + "step": 6966 + }, + { + "epoch": 1.8839913466738778, + "grad_norm": 3.234375, + "learning_rate": 0.014896936813804537, + "loss": 3.1447, + "mean_token_accuracy": 0.38117146492004395, + "num_tokens": 3561741072.0, + "step": 6967 + }, + { + "epoch": 1.8842617631151974, + "grad_norm": 10.5625, + "learning_rate": 0.01489551571092567, + "loss": 3.1585, + "mean_token_accuracy": 0.38486039638519287, + "num_tokens": 3562265342.0, + "step": 6968 + }, + { + "epoch": 1.884532179556517, + "grad_norm": 2.625, + "learning_rate": 0.014894094488520927, + "loss": 3.3179, + "mean_token_accuracy": 0.3856530785560608, + "num_tokens": 3562789501.0, + "step": 6969 + }, + { + "epoch": 1.8848025959978365, + "grad_norm": 2.390625, + "learning_rate": 0.01489267314663391, + "loss": 3.1472, + "mean_token_accuracy": 0.4074591398239136, + "num_tokens": 3563245326.0, + "step": 6970 + }, + { + "epoch": 1.8850730124391561, + "grad_norm": 6.4375, + "learning_rate": 0.014891251685308233, + "loss": 11.245, + "mean_token_accuracy": 2.5019014628924197e-06, + "num_tokens": 3563769520.0, + "step": 6971 + }, + { + "epoch": 1.8853434288804758, + "grad_norm": 6.9375, + "learning_rate": 0.01488983010458751, + "loss": 3.8416, + "mean_token_accuracy": 0.3383115231990814, + "num_tokens": 3564229021.0, + "step": 6972 + }, + { + "epoch": 1.8856138453217954, + "grad_norm": 2.09375, + "learning_rate": 0.014888408404515361, + "loss": 3.3687, + "mean_token_accuracy": 0.3763850927352905, + "num_tokens": 3564753062.0, + "step": 6973 + }, + { + "epoch": 1.885884261763115, + "grad_norm": 3.21875, + "learning_rate": 0.014886986585135407, + "loss": 3.4742, + "mean_token_accuracy": 0.35827428102493286, + "num_tokens": 3565277267.0, + "step": 6974 + }, + { + "epoch": 1.8861546782044347, + "grad_norm": 3.78125, + "learning_rate": 0.014885564646491276, + "loss": 3.5289, + "mean_token_accuracy": 0.36223840713500977, + "num_tokens": 3565801519.0, + "step": 6975 + }, + { + "epoch": 1.8864250946457544, + "grad_norm": 3.140625, + "learning_rate": 0.014884142588626595, + "loss": 3.1919, + "mean_token_accuracy": 0.39060479402542114, + "num_tokens": 3566325788.0, + "step": 6976 + }, + { + "epoch": 1.886695511087074, + "grad_norm": 3.625, + "learning_rate": 0.014882720411584996, + "loss": 3.5477, + "mean_token_accuracy": 0.32995209097862244, + "num_tokens": 3566849970.0, + "step": 6977 + }, + { + "epoch": 1.8869659275283936, + "grad_norm": 2.5625, + "learning_rate": 0.014881298115410113, + "loss": 3.1644, + "mean_token_accuracy": 0.40481066703796387, + "num_tokens": 3567338283.0, + "step": 6978 + }, + { + "epoch": 1.8872363439697133, + "grad_norm": 3.078125, + "learning_rate": 0.014879875700145593, + "loss": 3.374, + "mean_token_accuracy": 0.3676791787147522, + "num_tokens": 3567862549.0, + "step": 6979 + }, + { + "epoch": 1.887506760411033, + "grad_norm": 2.546875, + "learning_rate": 0.014878453165835075, + "loss": 3.1979, + "mean_token_accuracy": 0.40113747119903564, + "num_tokens": 3568374678.0, + "step": 6980 + }, + { + "epoch": 1.8877771768523526, + "grad_norm": 2.8125, + "learning_rate": 0.01487703051252221, + "loss": 3.1641, + "mean_token_accuracy": 0.38415342569351196, + "num_tokens": 3568898869.0, + "step": 6981 + }, + { + "epoch": 1.8880475932936722, + "grad_norm": 3.140625, + "learning_rate": 0.014875607740250648, + "loss": 3.1631, + "mean_token_accuracy": 0.39972394704818726, + "num_tokens": 3569422872.0, + "step": 6982 + }, + { + "epoch": 1.8883180097349919, + "grad_norm": 2.859375, + "learning_rate": 0.014874184849064038, + "loss": 3.1631, + "mean_token_accuracy": 0.40002161264419556, + "num_tokens": 3569902868.0, + "step": 6983 + }, + { + "epoch": 1.8885884261763115, + "grad_norm": 4.125, + "learning_rate": 0.014872761839006049, + "loss": 3.0647, + "mean_token_accuracy": 0.38085490465164185, + "num_tokens": 3570427009.0, + "step": 6984 + }, + { + "epoch": 1.8888588426176312, + "grad_norm": 2.40625, + "learning_rate": 0.014871338710120335, + "loss": 3.027, + "mean_token_accuracy": 0.4536503255367279, + "num_tokens": 3570825650.0, + "step": 6985 + }, + { + "epoch": 1.8891292590589508, + "grad_norm": 3.34375, + "learning_rate": 0.014869915462450565, + "loss": 2.9567, + "mean_token_accuracy": 0.4200857877731323, + "num_tokens": 3571320850.0, + "step": 6986 + }, + { + "epoch": 1.8893996755002704, + "grad_norm": 2.390625, + "learning_rate": 0.014868492096040407, + "loss": 3.1447, + "mean_token_accuracy": 0.3947345018386841, + "num_tokens": 3571840274.0, + "step": 6987 + }, + { + "epoch": 1.88967009194159, + "grad_norm": 3.140625, + "learning_rate": 0.014867068610933537, + "loss": 3.023, + "mean_token_accuracy": 0.40459781885147095, + "num_tokens": 3572364515.0, + "step": 6988 + }, + { + "epoch": 1.8899405083829097, + "grad_norm": 3.25, + "learning_rate": 0.01486564500717363, + "loss": 3.1, + "mean_token_accuracy": 0.3725845217704773, + "num_tokens": 3572869408.0, + "step": 6989 + }, + { + "epoch": 1.8902109248242294, + "grad_norm": 2.296875, + "learning_rate": 0.014864221284804363, + "loss": 3.2896, + "mean_token_accuracy": 0.38962605595588684, + "num_tokens": 3573393587.0, + "step": 6990 + }, + { + "epoch": 1.890481341265549, + "grad_norm": 123.5, + "learning_rate": 0.014862797443869424, + "loss": 19.3362, + "mean_token_accuracy": 0.01921427622437477, + "num_tokens": 3573917847.0, + "step": 6991 + }, + { + "epoch": 1.8907517577068687, + "grad_norm": 7.0, + "learning_rate": 0.014861373484412502, + "loss": 3.7192, + "mean_token_accuracy": 0.3201723098754883, + "num_tokens": 3574442089.0, + "step": 6992 + }, + { + "epoch": 1.8910221741481883, + "grad_norm": 2.703125, + "learning_rate": 0.014859949406477285, + "loss": 3.3778, + "mean_token_accuracy": 0.3725544810295105, + "num_tokens": 3574966310.0, + "step": 6993 + }, + { + "epoch": 1.891292590589508, + "grad_norm": 4.1875, + "learning_rate": 0.014858525210107467, + "loss": 3.3601, + "mean_token_accuracy": 0.36464032530784607, + "num_tokens": 3575452528.0, + "step": 6994 + }, + { + "epoch": 1.8915630070308276, + "grad_norm": 3.09375, + "learning_rate": 0.01485710089534675, + "loss": 3.3127, + "mean_token_accuracy": 0.3821142315864563, + "num_tokens": 3575976733.0, + "step": 6995 + }, + { + "epoch": 1.8918334234721472, + "grad_norm": 3.578125, + "learning_rate": 0.014855676462238836, + "loss": 3.5531, + "mean_token_accuracy": 0.3589232265949249, + "num_tokens": 3576454639.0, + "step": 6996 + }, + { + "epoch": 1.8921038399134669, + "grad_norm": 2.109375, + "learning_rate": 0.014854251910827428, + "loss": 3.0162, + "mean_token_accuracy": 0.3947453796863556, + "num_tokens": 3576978912.0, + "step": 6997 + }, + { + "epoch": 1.8923742563547865, + "grad_norm": 3.09375, + "learning_rate": 0.014852827241156235, + "loss": 3.2499, + "mean_token_accuracy": 0.41231268644332886, + "num_tokens": 3577443908.0, + "step": 6998 + }, + { + "epoch": 1.8926446727961062, + "grad_norm": 2.203125, + "learning_rate": 0.014851402453268974, + "loss": 3.0186, + "mean_token_accuracy": 0.40022170543670654, + "num_tokens": 3577951982.0, + "step": 6999 + }, + { + "epoch": 1.8929150892374258, + "grad_norm": 2.59375, + "learning_rate": 0.014849977547209356, + "loss": 3.171, + "mean_token_accuracy": 0.38962438702583313, + "num_tokens": 3578446734.0, + "step": 7000 + }, + { + "epoch": 1.8931855056787452, + "grad_norm": 2.25, + "learning_rate": 0.014848552523021107, + "loss": 3.1558, + "mean_token_accuracy": 0.4156680107116699, + "num_tokens": 3578970924.0, + "step": 7001 + }, + { + "epoch": 1.8934559221200649, + "grad_norm": 2.984375, + "learning_rate": 0.014847127380747954, + "loss": 3.1882, + "mean_token_accuracy": 0.3993200957775116, + "num_tokens": 3579456148.0, + "step": 7002 + }, + { + "epoch": 1.8937263385613845, + "grad_norm": 2.984375, + "learning_rate": 0.01484570212043361, + "loss": 3.392, + "mean_token_accuracy": 0.3809756934642792, + "num_tokens": 3579980415.0, + "step": 7003 + }, + { + "epoch": 1.8939967550027041, + "grad_norm": 4.125, + "learning_rate": 0.014844276742121822, + "loss": 3.5463, + "mean_token_accuracy": 0.34931451082229614, + "num_tokens": 3580504673.0, + "step": 7004 + }, + { + "epoch": 1.8942671714440238, + "grad_norm": 2.3125, + "learning_rate": 0.014842851245856317, + "loss": 3.2484, + "mean_token_accuracy": 0.3972427248954773, + "num_tokens": 3581028930.0, + "step": 7005 + }, + { + "epoch": 1.8945375878853434, + "grad_norm": 3.359375, + "learning_rate": 0.014841425631680834, + "loss": 3.2816, + "mean_token_accuracy": 0.3708834648132324, + "num_tokens": 3581552991.0, + "step": 7006 + }, + { + "epoch": 1.894808004326663, + "grad_norm": 2.21875, + "learning_rate": 0.014839999899639114, + "loss": 3.0973, + "mean_token_accuracy": 0.3995545506477356, + "num_tokens": 3582077119.0, + "step": 7007 + }, + { + "epoch": 1.8950784207679827, + "grad_norm": 3.203125, + "learning_rate": 0.014838574049774907, + "loss": 3.2614, + "mean_token_accuracy": 0.37522056698799133, + "num_tokens": 3582567451.0, + "step": 7008 + }, + { + "epoch": 1.8953488372093024, + "grad_norm": 2.421875, + "learning_rate": 0.014837148082131957, + "loss": 3.1508, + "mean_token_accuracy": 0.3956773281097412, + "num_tokens": 3583091643.0, + "step": 7009 + }, + { + "epoch": 1.895619253650622, + "grad_norm": 3.03125, + "learning_rate": 0.014835721996754021, + "loss": 3.1969, + "mean_token_accuracy": 0.3852074444293976, + "num_tokens": 3583615819.0, + "step": 7010 + }, + { + "epoch": 1.8958896700919414, + "grad_norm": 342.0, + "learning_rate": 0.014834295793684855, + "loss": 22.9062, + "mean_token_accuracy": 0.0008141631842590868, + "num_tokens": 3584140031.0, + "step": 7011 + }, + { + "epoch": 1.896160086533261, + "grad_norm": 7.46875, + "learning_rate": 0.014832869472968219, + "loss": 3.6658, + "mean_token_accuracy": 0.377690851688385, + "num_tokens": 3584658908.0, + "step": 7012 + }, + { + "epoch": 1.8964305029745807, + "grad_norm": 2.453125, + "learning_rate": 0.014831443034647873, + "loss": 3.1884, + "mean_token_accuracy": 0.37602612376213074, + "num_tokens": 3585183188.0, + "step": 7013 + }, + { + "epoch": 1.8967009194159004, + "grad_norm": 2.265625, + "learning_rate": 0.014830016478767588, + "loss": 3.1073, + "mean_token_accuracy": 0.37510502338409424, + "num_tokens": 3585707468.0, + "step": 7014 + }, + { + "epoch": 1.89697133585722, + "grad_norm": 3.0, + "learning_rate": 0.014828589805371134, + "loss": 3.3119, + "mean_token_accuracy": 0.36380907893180847, + "num_tokens": 3586231661.0, + "step": 7015 + }, + { + "epoch": 1.8972417522985396, + "grad_norm": 3.625, + "learning_rate": 0.014827163014502287, + "loss": 3.2348, + "mean_token_accuracy": 0.41127538681030273, + "num_tokens": 3586723496.0, + "step": 7016 + }, + { + "epoch": 1.8975121687398593, + "grad_norm": 3.5625, + "learning_rate": 0.014825736106204822, + "loss": 3.4159, + "mean_token_accuracy": 0.37337327003479004, + "num_tokens": 3587247776.0, + "step": 7017 + }, + { + "epoch": 1.897782585181179, + "grad_norm": 3.578125, + "learning_rate": 0.014824309080522525, + "loss": 3.1202, + "mean_token_accuracy": 0.39629101753234863, + "num_tokens": 3587750868.0, + "step": 7018 + }, + { + "epoch": 1.8980530016224986, + "grad_norm": 2.890625, + "learning_rate": 0.014822881937499175, + "loss": 3.0477, + "mean_token_accuracy": 0.3918147087097168, + "num_tokens": 3588274990.0, + "step": 7019 + }, + { + "epoch": 1.8983234180638182, + "grad_norm": 3.640625, + "learning_rate": 0.014821454677178568, + "loss": 3.3746, + "mean_token_accuracy": 0.3858357071876526, + "num_tokens": 3588743521.0, + "step": 7020 + }, + { + "epoch": 1.8985938345051379, + "grad_norm": 3.6875, + "learning_rate": 0.01482002729960449, + "loss": 3.4699, + "mean_token_accuracy": 0.3845999538898468, + "num_tokens": 3589267789.0, + "step": 7021 + }, + { + "epoch": 1.8988642509464575, + "grad_norm": 2.796875, + "learning_rate": 0.014818599804820745, + "loss": 3.3342, + "mean_token_accuracy": 0.39608633518218994, + "num_tokens": 3589752994.0, + "step": 7022 + }, + { + "epoch": 1.8991346673877771, + "grad_norm": 3.21875, + "learning_rate": 0.014817172192871125, + "loss": 3.3495, + "mean_token_accuracy": 0.39129674434661865, + "num_tokens": 3590244411.0, + "step": 7023 + }, + { + "epoch": 1.8994050838290968, + "grad_norm": 2.171875, + "learning_rate": 0.014815744463799437, + "loss": 3.0208, + "mean_token_accuracy": 0.4129611849784851, + "num_tokens": 3590768601.0, + "step": 7024 + }, + { + "epoch": 1.8996755002704164, + "grad_norm": 2.046875, + "learning_rate": 0.014814316617649488, + "loss": 3.1214, + "mean_token_accuracy": 0.4672403633594513, + "num_tokens": 3591181403.0, + "step": 7025 + }, + { + "epoch": 1.899945916711736, + "grad_norm": 3.203125, + "learning_rate": 0.01481288865446509, + "loss": 3.3466, + "mean_token_accuracy": 0.3684537410736084, + "num_tokens": 3591705550.0, + "step": 7026 + }, + { + "epoch": 1.9002163331530557, + "grad_norm": 3.171875, + "learning_rate": 0.014811460574290052, + "loss": 3.098, + "mean_token_accuracy": 0.39131543040275574, + "num_tokens": 3592229810.0, + "step": 7027 + }, + { + "epoch": 1.9004867495943754, + "grad_norm": 2.4375, + "learning_rate": 0.014810032377168195, + "loss": 3.1158, + "mean_token_accuracy": 0.39402782917022705, + "num_tokens": 3592753954.0, + "step": 7028 + }, + { + "epoch": 1.900757166035695, + "grad_norm": 2.765625, + "learning_rate": 0.01480860406314334, + "loss": 3.2862, + "mean_token_accuracy": 0.3829621374607086, + "num_tokens": 3593278124.0, + "step": 7029 + }, + { + "epoch": 1.9010275824770146, + "grad_norm": 2.75, + "learning_rate": 0.014807175632259313, + "loss": 3.3032, + "mean_token_accuracy": 0.37798449397087097, + "num_tokens": 3593802309.0, + "step": 7030 + }, + { + "epoch": 1.9012979989183343, + "grad_norm": 16.5, + "learning_rate": 0.014805747084559943, + "loss": 12.0737, + "mean_token_accuracy": 3.319854658911936e-05, + "num_tokens": 3594326355.0, + "step": 7031 + }, + { + "epoch": 1.901568415359654, + "grad_norm": 5.5, + "learning_rate": 0.014804318420089058, + "loss": 3.7104, + "mean_token_accuracy": 0.3150331974029541, + "num_tokens": 3594850605.0, + "step": 7032 + }, + { + "epoch": 1.9018388318009736, + "grad_norm": 2.125, + "learning_rate": 0.014802889638890497, + "loss": 3.0413, + "mean_token_accuracy": 0.39595720171928406, + "num_tokens": 3595322165.0, + "step": 7033 + }, + { + "epoch": 1.9021092482422932, + "grad_norm": 2.484375, + "learning_rate": 0.0148014607410081, + "loss": 3.2663, + "mean_token_accuracy": 0.4034906029701233, + "num_tokens": 3595846369.0, + "step": 7034 + }, + { + "epoch": 1.9023796646836129, + "grad_norm": 3.09375, + "learning_rate": 0.014800031726485705, + "loss": 3.4237, + "mean_token_accuracy": 0.4139268100261688, + "num_tokens": 3596263414.0, + "step": 7035 + }, + { + "epoch": 1.9026500811249325, + "grad_norm": 2.734375, + "learning_rate": 0.014798602595367163, + "loss": 3.3256, + "mean_token_accuracy": 0.3879544138908386, + "num_tokens": 3596787521.0, + "step": 7036 + }, + { + "epoch": 1.9029204975662521, + "grad_norm": 2.3125, + "learning_rate": 0.01479717334769632, + "loss": 3.1232, + "mean_token_accuracy": 0.3982842266559601, + "num_tokens": 3597311693.0, + "step": 7037 + }, + { + "epoch": 1.9031909140075718, + "grad_norm": 4.0, + "learning_rate": 0.014795743983517035, + "loss": 3.2606, + "mean_token_accuracy": 0.3914707601070404, + "num_tokens": 3597829247.0, + "step": 7038 + }, + { + "epoch": 1.9034613304488914, + "grad_norm": 2.984375, + "learning_rate": 0.014794314502873163, + "loss": 3.1043, + "mean_token_accuracy": 0.41188013553619385, + "num_tokens": 3598353505.0, + "step": 7039 + }, + { + "epoch": 1.903731746890211, + "grad_norm": 3.109375, + "learning_rate": 0.014792884905808561, + "loss": 3.2088, + "mean_token_accuracy": 0.39032262563705444, + "num_tokens": 3598877682.0, + "step": 7040 + }, + { + "epoch": 1.9040021633315307, + "grad_norm": 2.40625, + "learning_rate": 0.014791455192367098, + "loss": 3.1443, + "mean_token_accuracy": 0.41381919384002686, + "num_tokens": 3599390461.0, + "step": 7041 + }, + { + "epoch": 1.9042725797728501, + "grad_norm": 3.59375, + "learning_rate": 0.014790025362592637, + "loss": 3.3021, + "mean_token_accuracy": 0.385869562625885, + "num_tokens": 3599914731.0, + "step": 7042 + }, + { + "epoch": 1.9045429962141698, + "grad_norm": 2.65625, + "learning_rate": 0.014788595416529054, + "loss": 3.2743, + "mean_token_accuracy": 0.39189228415489197, + "num_tokens": 3600439002.0, + "step": 7043 + }, + { + "epoch": 1.9048134126554894, + "grad_norm": 2.796875, + "learning_rate": 0.014787165354220224, + "loss": 3.1378, + "mean_token_accuracy": 0.37372100353240967, + "num_tokens": 3600963208.0, + "step": 7044 + }, + { + "epoch": 1.905083829096809, + "grad_norm": 2.921875, + "learning_rate": 0.01478573517571002, + "loss": 3.3074, + "mean_token_accuracy": 0.3704986274242401, + "num_tokens": 3601487426.0, + "step": 7045 + }, + { + "epoch": 1.9053542455381287, + "grad_norm": 3.09375, + "learning_rate": 0.01478430488104233, + "loss": 3.1057, + "mean_token_accuracy": 0.41933614015579224, + "num_tokens": 3602011579.0, + "step": 7046 + }, + { + "epoch": 1.9056246619794484, + "grad_norm": 2.875, + "learning_rate": 0.014782874470261036, + "loss": 3.2367, + "mean_token_accuracy": 0.3807397484779358, + "num_tokens": 3602535846.0, + "step": 7047 + }, + { + "epoch": 1.905895078420768, + "grad_norm": 3.265625, + "learning_rate": 0.014781443943410029, + "loss": 3.1759, + "mean_token_accuracy": 0.38929271697998047, + "num_tokens": 3603060120.0, + "step": 7048 + }, + { + "epoch": 1.9061654948620876, + "grad_norm": 2.65625, + "learning_rate": 0.014780013300533202, + "loss": 3.04, + "mean_token_accuracy": 0.4219783544540405, + "num_tokens": 3603530787.0, + "step": 7049 + }, + { + "epoch": 1.9064359113034073, + "grad_norm": 2.96875, + "learning_rate": 0.014778582541674449, + "loss": 3.3838, + "mean_token_accuracy": 0.38201653957366943, + "num_tokens": 3603999149.0, + "step": 7050 + }, + { + "epoch": 1.906706327744727, + "grad_norm": 68.0, + "learning_rate": 0.014777151666877673, + "loss": 11.3061, + "mean_token_accuracy": 0.001143673900514841, + "num_tokens": 3604523399.0, + "step": 7051 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 7.59375, + "learning_rate": 0.014775720676186774, + "loss": 3.8294, + "mean_token_accuracy": 0.34924185276031494, + "num_tokens": 3605001741.0, + "step": 7052 + }, + { + "epoch": 1.907247160627366, + "grad_norm": 3.15625, + "learning_rate": 0.014774289569645662, + "loss": 3.3594, + "mean_token_accuracy": 0.3709089756011963, + "num_tokens": 3605490782.0, + "step": 7053 + }, + { + "epoch": 1.9075175770686856, + "grad_norm": 3.953125, + "learning_rate": 0.01477285834729825, + "loss": 3.1558, + "mean_token_accuracy": 0.3891754746437073, + "num_tokens": 3605988112.0, + "step": 7054 + }, + { + "epoch": 1.9077879935100053, + "grad_norm": 3.5, + "learning_rate": 0.014771427009188445, + "loss": 3.2778, + "mean_token_accuracy": 0.33380064368247986, + "num_tokens": 3606512274.0, + "step": 7055 + }, + { + "epoch": 1.908058409951325, + "grad_norm": 3.59375, + "learning_rate": 0.014769995555360167, + "loss": 3.6219, + "mean_token_accuracy": 0.37130436301231384, + "num_tokens": 3607036544.0, + "step": 7056 + }, + { + "epoch": 1.9083288263926446, + "grad_norm": 3.546875, + "learning_rate": 0.014768563985857342, + "loss": 3.4877, + "mean_token_accuracy": 0.3726663589477539, + "num_tokens": 3607560761.0, + "step": 7057 + }, + { + "epoch": 1.9085992428339642, + "grad_norm": 2.8125, + "learning_rate": 0.014767132300723889, + "loss": 3.2021, + "mean_token_accuracy": 0.39257383346557617, + "num_tokens": 3608084932.0, + "step": 7058 + }, + { + "epoch": 1.9088696592752838, + "grad_norm": 2.5625, + "learning_rate": 0.01476570050000374, + "loss": 3.4986, + "mean_token_accuracy": 0.3685949146747589, + "num_tokens": 3608609201.0, + "step": 7059 + }, + { + "epoch": 1.9091400757166035, + "grad_norm": 2.921875, + "learning_rate": 0.014764268583740825, + "loss": 3.3463, + "mean_token_accuracy": 0.3874273896217346, + "num_tokens": 3609078281.0, + "step": 7060 + }, + { + "epoch": 1.9094104921579231, + "grad_norm": 2.53125, + "learning_rate": 0.01476283655197908, + "loss": 3.1646, + "mean_token_accuracy": 0.3917456865310669, + "num_tokens": 3609602450.0, + "step": 7061 + }, + { + "epoch": 1.9096809085992428, + "grad_norm": 2.3125, + "learning_rate": 0.014761404404762445, + "loss": 3.122, + "mean_token_accuracy": 0.39243143796920776, + "num_tokens": 3610126647.0, + "step": 7062 + }, + { + "epoch": 1.9099513250405624, + "grad_norm": 2.53125, + "learning_rate": 0.014759972142134859, + "loss": 3.1547, + "mean_token_accuracy": 0.3801632523536682, + "num_tokens": 3610650893.0, + "step": 7063 + }, + { + "epoch": 1.910221741481882, + "grad_norm": 2.5, + "learning_rate": 0.014758539764140268, + "loss": 2.9955, + "mean_token_accuracy": 0.3965598940849304, + "num_tokens": 3611119272.0, + "step": 7064 + }, + { + "epoch": 1.9104921579232017, + "grad_norm": 2.40625, + "learning_rate": 0.01475710727082263, + "loss": 3.2591, + "mean_token_accuracy": 0.3970983624458313, + "num_tokens": 3611588514.0, + "step": 7065 + }, + { + "epoch": 1.9107625743645213, + "grad_norm": 2.4375, + "learning_rate": 0.014755674662225887, + "loss": 2.7796, + "mean_token_accuracy": 0.42330920696258545, + "num_tokens": 3612064014.0, + "step": 7066 + }, + { + "epoch": 1.911032990805841, + "grad_norm": 2.9375, + "learning_rate": 0.014754241938394004, + "loss": 3.0752, + "mean_token_accuracy": 0.401187539100647, + "num_tokens": 3612543546.0, + "step": 7067 + }, + { + "epoch": 1.9113034072471606, + "grad_norm": 2.875, + "learning_rate": 0.014752809099370935, + "loss": 3.1381, + "mean_token_accuracy": 0.40541934967041016, + "num_tokens": 3613013896.0, + "step": 7068 + }, + { + "epoch": 1.9115738236884803, + "grad_norm": 3.078125, + "learning_rate": 0.014751376145200648, + "loss": 3.231, + "mean_token_accuracy": 0.3940936326980591, + "num_tokens": 3613538166.0, + "step": 7069 + }, + { + "epoch": 1.9118442401298, + "grad_norm": 3.265625, + "learning_rate": 0.01474994307592711, + "loss": 3.1904, + "mean_token_accuracy": 0.3977805972099304, + "num_tokens": 3614062440.0, + "step": 7070 + }, + { + "epoch": 1.9121146565711196, + "grad_norm": 36.25, + "learning_rate": 0.014748509891594288, + "loss": 11.8186, + "mean_token_accuracy": 0.02783745899796486, + "num_tokens": 3614529424.0, + "step": 7071 + }, + { + "epoch": 1.9123850730124392, + "grad_norm": 5.28125, + "learning_rate": 0.01474707659224616, + "loss": 3.5987, + "mean_token_accuracy": 0.3535793125629425, + "num_tokens": 3614998268.0, + "step": 7072 + }, + { + "epoch": 1.9126554894537589, + "grad_norm": 2.3125, + "learning_rate": 0.014745643177926704, + "loss": 3.5045, + "mean_token_accuracy": 0.368977427482605, + "num_tokens": 3615522547.0, + "step": 7073 + }, + { + "epoch": 1.9129259058950785, + "grad_norm": 3.203125, + "learning_rate": 0.014744209648679901, + "loss": 3.3951, + "mean_token_accuracy": 0.3643506169319153, + "num_tokens": 3616046790.0, + "step": 7074 + }, + { + "epoch": 1.9131963223363981, + "grad_norm": 3.265625, + "learning_rate": 0.014742776004549731, + "loss": 3.3772, + "mean_token_accuracy": 0.38183653354644775, + "num_tokens": 3616549300.0, + "step": 7075 + }, + { + "epoch": 1.9134667387777178, + "grad_norm": 2.78125, + "learning_rate": 0.014741342245580187, + "loss": 3.2882, + "mean_token_accuracy": 0.3802313804626465, + "num_tokens": 3617073497.0, + "step": 7076 + }, + { + "epoch": 1.9137371552190374, + "grad_norm": 3.4375, + "learning_rate": 0.014739908371815262, + "loss": 3.1227, + "mean_token_accuracy": 0.3859172463417053, + "num_tokens": 3617597677.0, + "step": 7077 + }, + { + "epoch": 1.914007571660357, + "grad_norm": 2.5, + "learning_rate": 0.014738474383298951, + "loss": 3.2613, + "mean_token_accuracy": 0.37991294264793396, + "num_tokens": 3618121880.0, + "step": 7078 + }, + { + "epoch": 1.9142779881016767, + "grad_norm": 3.75, + "learning_rate": 0.01473704028007525, + "loss": 3.0384, + "mean_token_accuracy": 0.3667624592781067, + "num_tokens": 3618646135.0, + "step": 7079 + }, + { + "epoch": 1.9145484045429964, + "grad_norm": 2.25, + "learning_rate": 0.014735606062188163, + "loss": 3.1172, + "mean_token_accuracy": 0.38968175649642944, + "num_tokens": 3619170282.0, + "step": 7080 + }, + { + "epoch": 1.914818820984316, + "grad_norm": 3.5625, + "learning_rate": 0.014734171729681698, + "loss": 3.263, + "mean_token_accuracy": 0.3821977972984314, + "num_tokens": 3619694483.0, + "step": 7081 + }, + { + "epoch": 1.9150892374256356, + "grad_norm": 2.96875, + "learning_rate": 0.014732737282599861, + "loss": 3.265, + "mean_token_accuracy": 0.3934588134288788, + "num_tokens": 3620218729.0, + "step": 7082 + }, + { + "epoch": 1.915359653866955, + "grad_norm": 3.171875, + "learning_rate": 0.014731302720986668, + "loss": 3.1679, + "mean_token_accuracy": 0.399345725774765, + "num_tokens": 3620742902.0, + "step": 7083 + }, + { + "epoch": 1.9156300703082747, + "grad_norm": 3.078125, + "learning_rate": 0.014729868044886137, + "loss": 3.178, + "mean_token_accuracy": 0.4036925733089447, + "num_tokens": 3621267033.0, + "step": 7084 + }, + { + "epoch": 1.9159004867495943, + "grad_norm": 2.65625, + "learning_rate": 0.014728433254342279, + "loss": 3.1382, + "mean_token_accuracy": 0.34434494376182556, + "num_tokens": 3621791175.0, + "step": 7085 + }, + { + "epoch": 1.916170903190914, + "grad_norm": 2.390625, + "learning_rate": 0.014726998349399128, + "loss": 3.2468, + "mean_token_accuracy": 0.38422951102256775, + "num_tokens": 3622315443.0, + "step": 7086 + }, + { + "epoch": 1.9164413196322336, + "grad_norm": 3.453125, + "learning_rate": 0.014725563330100708, + "loss": 3.238, + "mean_token_accuracy": 0.4017700254917145, + "num_tokens": 3622796664.0, + "step": 7087 + }, + { + "epoch": 1.9167117360735533, + "grad_norm": 2.921875, + "learning_rate": 0.014724128196491047, + "loss": 3.3544, + "mean_token_accuracy": 0.3892507255077362, + "num_tokens": 3623320816.0, + "step": 7088 + }, + { + "epoch": 1.916982152514873, + "grad_norm": 4.125, + "learning_rate": 0.014722692948614183, + "loss": 3.4377, + "mean_token_accuracy": 0.37178003787994385, + "num_tokens": 3623845083.0, + "step": 7089 + }, + { + "epoch": 1.9172525689561926, + "grad_norm": 2.546875, + "learning_rate": 0.014721257586514151, + "loss": 3.146, + "mean_token_accuracy": 0.4023694396018982, + "num_tokens": 3624369286.0, + "step": 7090 + }, + { + "epoch": 1.9175229853975122, + "grad_norm": 87.5, + "learning_rate": 0.014719822110234993, + "loss": 43.0867, + "mean_token_accuracy": 0.0, + "num_tokens": 3624893473.0, + "step": 7091 + }, + { + "epoch": 1.9177934018388318, + "grad_norm": 9.4375, + "learning_rate": 0.01471838651982075, + "loss": 3.7, + "mean_token_accuracy": 0.328006386756897, + "num_tokens": 3625417705.0, + "step": 7092 + }, + { + "epoch": 1.9180638182801513, + "grad_norm": 2.140625, + "learning_rate": 0.014716950815315474, + "loss": 3.3236, + "mean_token_accuracy": 0.3405710458755493, + "num_tokens": 3625941927.0, + "step": 7093 + }, + { + "epoch": 1.918334234721471, + "grad_norm": 2.71875, + "learning_rate": 0.014715514996763221, + "loss": 3.5213, + "mean_token_accuracy": 0.3504791259765625, + "num_tokens": 3626466209.0, + "step": 7094 + }, + { + "epoch": 1.9186046511627906, + "grad_norm": 3.59375, + "learning_rate": 0.01471407906420804, + "loss": 3.3501, + "mean_token_accuracy": 0.3810884952545166, + "num_tokens": 3626989751.0, + "step": 7095 + }, + { + "epoch": 1.9188750676041102, + "grad_norm": 2.203125, + "learning_rate": 0.01471264301769399, + "loss": 3.0093, + "mean_token_accuracy": 0.3911064565181732, + "num_tokens": 3627513936.0, + "step": 7096 + }, + { + "epoch": 1.9191454840454298, + "grad_norm": 2.78125, + "learning_rate": 0.014711206857265131, + "loss": 2.9987, + "mean_token_accuracy": 0.40832656621932983, + "num_tokens": 3628038157.0, + "step": 7097 + }, + { + "epoch": 1.9194159004867495, + "grad_norm": 2.40625, + "learning_rate": 0.014709770582965536, + "loss": 3.3241, + "mean_token_accuracy": 0.3882015347480774, + "num_tokens": 3628562183.0, + "step": 7098 + }, + { + "epoch": 1.9196863169280691, + "grad_norm": 3.1875, + "learning_rate": 0.014708334194839268, + "loss": 3.3804, + "mean_token_accuracy": 0.38942405581474304, + "num_tokens": 3629086450.0, + "step": 7099 + }, + { + "epoch": 1.9199567333693888, + "grad_norm": 2.390625, + "learning_rate": 0.0147068976929304, + "loss": 3.2962, + "mean_token_accuracy": 0.3741474151611328, + "num_tokens": 3629610563.0, + "step": 7100 + }, + { + "epoch": 1.9202271498107084, + "grad_norm": 2.515625, + "learning_rate": 0.014705461077283011, + "loss": 2.921, + "mean_token_accuracy": 0.39999857544898987, + "num_tokens": 3630134836.0, + "step": 7101 + }, + { + "epoch": 1.920497566252028, + "grad_norm": 2.859375, + "learning_rate": 0.01470402434794118, + "loss": 3.2354, + "mean_token_accuracy": 0.3952968716621399, + "num_tokens": 3630611083.0, + "step": 7102 + }, + { + "epoch": 1.9207679826933477, + "grad_norm": 2.484375, + "learning_rate": 0.014702587504948985, + "loss": 3.0976, + "mean_token_accuracy": 0.40083855390548706, + "num_tokens": 3631135282.0, + "step": 7103 + }, + { + "epoch": 1.9210383991346673, + "grad_norm": 2.546875, + "learning_rate": 0.014701150548350524, + "loss": 3.1438, + "mean_token_accuracy": 0.41676008701324463, + "num_tokens": 3631612544.0, + "step": 7104 + }, + { + "epoch": 1.921308815575987, + "grad_norm": 3.09375, + "learning_rate": 0.014699713478189876, + "loss": 3.2641, + "mean_token_accuracy": 0.3993692994117737, + "num_tokens": 3632136679.0, + "step": 7105 + }, + { + "epoch": 1.9215792320173066, + "grad_norm": 2.90625, + "learning_rate": 0.014698276294511137, + "loss": 3.1655, + "mean_token_accuracy": 0.4172340929508209, + "num_tokens": 3632658614.0, + "step": 7106 + }, + { + "epoch": 1.9218496484586263, + "grad_norm": 2.96875, + "learning_rate": 0.014696838997358406, + "loss": 3.2166, + "mean_token_accuracy": 0.3901367783546448, + "num_tokens": 3633182794.0, + "step": 7107 + }, + { + "epoch": 1.922120064899946, + "grad_norm": 2.84375, + "learning_rate": 0.014695401586775783, + "loss": 3.2213, + "mean_token_accuracy": 0.37220072746276855, + "num_tokens": 3633696513.0, + "step": 7108 + }, + { + "epoch": 1.9223904813412656, + "grad_norm": 2.734375, + "learning_rate": 0.014693964062807372, + "loss": 3.2617, + "mean_token_accuracy": 0.37756162881851196, + "num_tokens": 3634220773.0, + "step": 7109 + }, + { + "epoch": 1.9226608977825852, + "grad_norm": 2.71875, + "learning_rate": 0.01469252642549728, + "loss": 3.1165, + "mean_token_accuracy": 0.38493430614471436, + "num_tokens": 3634745026.0, + "step": 7110 + }, + { + "epoch": 1.9229313142239048, + "grad_norm": 75.0, + "learning_rate": 0.014691088674889621, + "loss": 12.1952, + "mean_token_accuracy": 0.0013977829366922379, + "num_tokens": 3635269138.0, + "step": 7111 + }, + { + "epoch": 1.9232017306652245, + "grad_norm": 6.5625, + "learning_rate": 0.014689650811028503, + "loss": 3.7195, + "mean_token_accuracy": 0.3551187813282013, + "num_tokens": 3635793355.0, + "step": 7112 + }, + { + "epoch": 1.9234721471065441, + "grad_norm": 2.90625, + "learning_rate": 0.01468821283395805, + "loss": 3.4314, + "mean_token_accuracy": 0.3826441168785095, + "num_tokens": 3636317545.0, + "step": 7113 + }, + { + "epoch": 1.9237425635478638, + "grad_norm": 3.265625, + "learning_rate": 0.01468677474372238, + "loss": 3.1597, + "mean_token_accuracy": 0.37230184674263, + "num_tokens": 3636841769.0, + "step": 7114 + }, + { + "epoch": 1.9240129799891834, + "grad_norm": 3.328125, + "learning_rate": 0.014685336540365617, + "loss": 3.3735, + "mean_token_accuracy": 0.3731979429721832, + "num_tokens": 3637366052.0, + "step": 7115 + }, + { + "epoch": 1.924283396430503, + "grad_norm": 4.34375, + "learning_rate": 0.014683898223931893, + "loss": 3.4108, + "mean_token_accuracy": 0.37543052434921265, + "num_tokens": 3637890331.0, + "step": 7116 + }, + { + "epoch": 1.9245538128718227, + "grad_norm": 4.59375, + "learning_rate": 0.01468245979446534, + "loss": 3.3555, + "mean_token_accuracy": 0.38223278522491455, + "num_tokens": 3638363157.0, + "step": 7117 + }, + { + "epoch": 1.9248242293131423, + "grad_norm": 2.578125, + "learning_rate": 0.014681021252010087, + "loss": 3.3184, + "mean_token_accuracy": 0.3605189919471741, + "num_tokens": 3638887414.0, + "step": 7118 + }, + { + "epoch": 1.925094645754462, + "grad_norm": 2.71875, + "learning_rate": 0.014679582596610277, + "loss": 3.2664, + "mean_token_accuracy": 0.3930581212043762, + "num_tokens": 3639411646.0, + "step": 7119 + }, + { + "epoch": 1.9253650621957816, + "grad_norm": 2.453125, + "learning_rate": 0.014678143828310055, + "loss": 3.1645, + "mean_token_accuracy": 0.39653804898262024, + "num_tokens": 3639888684.0, + "step": 7120 + }, + { + "epoch": 1.9256354786371013, + "grad_norm": 2.890625, + "learning_rate": 0.014676704947153561, + "loss": 3.3024, + "mean_token_accuracy": 0.3562474846839905, + "num_tokens": 3640412889.0, + "step": 7121 + }, + { + "epoch": 1.925905895078421, + "grad_norm": 2.5625, + "learning_rate": 0.01467526595318495, + "loss": 3.1075, + "mean_token_accuracy": 0.3959805369377136, + "num_tokens": 3640937067.0, + "step": 7122 + }, + { + "epoch": 1.9261763115197406, + "grad_norm": 2.671875, + "learning_rate": 0.014673826846448369, + "loss": 3.2275, + "mean_token_accuracy": 0.3865933120250702, + "num_tokens": 3641461270.0, + "step": 7123 + }, + { + "epoch": 1.92644672796106, + "grad_norm": 3.34375, + "learning_rate": 0.014672387626987976, + "loss": 3.2428, + "mean_token_accuracy": 0.3834715187549591, + "num_tokens": 3641985419.0, + "step": 7124 + }, + { + "epoch": 1.9267171444023796, + "grad_norm": 11.5625, + "learning_rate": 0.01467094829484793, + "loss": 3.1535, + "mean_token_accuracy": 0.4112818241119385, + "num_tokens": 3642456215.0, + "step": 7125 + }, + { + "epoch": 1.9269875608436993, + "grad_norm": 2.59375, + "learning_rate": 0.014669508850072396, + "loss": 3.2348, + "mean_token_accuracy": 0.40428397059440613, + "num_tokens": 3642980380.0, + "step": 7126 + }, + { + "epoch": 1.927257977285019, + "grad_norm": 3.140625, + "learning_rate": 0.01466806929270554, + "loss": 3.0715, + "mean_token_accuracy": 0.4200338125228882, + "num_tokens": 3643504667.0, + "step": 7127 + }, + { + "epoch": 1.9275283937263386, + "grad_norm": 4.25, + "learning_rate": 0.014666629622791529, + "loss": 3.2027, + "mean_token_accuracy": 0.3985108733177185, + "num_tokens": 3644002906.0, + "step": 7128 + }, + { + "epoch": 1.9277988101676582, + "grad_norm": 2.109375, + "learning_rate": 0.01466518984037454, + "loss": 3.2271, + "mean_token_accuracy": 0.39050155878067017, + "num_tokens": 3644527124.0, + "step": 7129 + }, + { + "epoch": 1.9280692266089778, + "grad_norm": 3.0625, + "learning_rate": 0.01466374994549875, + "loss": 3.3269, + "mean_token_accuracy": 0.37758392095565796, + "num_tokens": 3645013223.0, + "step": 7130 + }, + { + "epoch": 1.9283396430502975, + "grad_norm": 5.28125, + "learning_rate": 0.014662309938208334, + "loss": 10.5135, + "mean_token_accuracy": 1.513599454483483e-05, + "num_tokens": 3645492526.0, + "step": 7131 + }, + { + "epoch": 1.9286100594916171, + "grad_norm": 5.40625, + "learning_rate": 0.01466086981854748, + "loss": 3.5352, + "mean_token_accuracy": 0.35743236541748047, + "num_tokens": 3646016671.0, + "step": 7132 + }, + { + "epoch": 1.9288804759329368, + "grad_norm": 2.90625, + "learning_rate": 0.014659429586560375, + "loss": 3.4727, + "mean_token_accuracy": 0.3759283423423767, + "num_tokens": 3646540821.0, + "step": 7133 + }, + { + "epoch": 1.9291508923742564, + "grad_norm": 3.125, + "learning_rate": 0.014657989242291208, + "loss": 3.3989, + "mean_token_accuracy": 0.36854472756385803, + "num_tokens": 3647065106.0, + "step": 7134 + }, + { + "epoch": 1.9294213088155758, + "grad_norm": 3.359375, + "learning_rate": 0.014656548785784169, + "loss": 3.4127, + "mean_token_accuracy": 0.3710675835609436, + "num_tokens": 3647589327.0, + "step": 7135 + }, + { + "epoch": 1.9296917252568955, + "grad_norm": 2.78125, + "learning_rate": 0.014655108217083466, + "loss": 3.2869, + "mean_token_accuracy": 0.3973323404788971, + "num_tokens": 3648113567.0, + "step": 7136 + }, + { + "epoch": 1.9299621416982151, + "grad_norm": 3.875, + "learning_rate": 0.014653667536233293, + "loss": 3.2223, + "mean_token_accuracy": 0.3907824158668518, + "num_tokens": 3648637809.0, + "step": 7137 + }, + { + "epoch": 1.9302325581395348, + "grad_norm": 2.96875, + "learning_rate": 0.014652226743277855, + "loss": 3.0433, + "mean_token_accuracy": 0.4207616448402405, + "num_tokens": 3649101018.0, + "step": 7138 + }, + { + "epoch": 1.9305029745808544, + "grad_norm": 3.078125, + "learning_rate": 0.014650785838261356, + "loss": 3.3825, + "mean_token_accuracy": 0.3877241611480713, + "num_tokens": 3649604046.0, + "step": 7139 + }, + { + "epoch": 1.930773391022174, + "grad_norm": 3.375, + "learning_rate": 0.014649344821228016, + "loss": 3.3412, + "mean_token_accuracy": 0.37641996145248413, + "num_tokens": 3650128316.0, + "step": 7140 + }, + { + "epoch": 1.9310438074634937, + "grad_norm": 2.625, + "learning_rate": 0.014647903692222047, + "loss": 3.2278, + "mean_token_accuracy": 0.3978206515312195, + "num_tokens": 3650652443.0, + "step": 7141 + }, + { + "epoch": 1.9313142239048133, + "grad_norm": 2.96875, + "learning_rate": 0.014646462451287657, + "loss": 3.2068, + "mean_token_accuracy": 0.3913089334964752, + "num_tokens": 3651175151.0, + "step": 7142 + }, + { + "epoch": 1.931584640346133, + "grad_norm": 2.578125, + "learning_rate": 0.014645021098469083, + "loss": 3.0917, + "mean_token_accuracy": 0.4004669487476349, + "num_tokens": 3651664939.0, + "step": 7143 + }, + { + "epoch": 1.9318550567874526, + "grad_norm": 3.015625, + "learning_rate": 0.014643579633810536, + "loss": 3.2423, + "mean_token_accuracy": 0.3742232322692871, + "num_tokens": 3652188979.0, + "step": 7144 + }, + { + "epoch": 1.9321254732287723, + "grad_norm": 2.09375, + "learning_rate": 0.014642138057356254, + "loss": 3.155, + "mean_token_accuracy": 0.3987433910369873, + "num_tokens": 3652659956.0, + "step": 7145 + }, + { + "epoch": 1.932395889670092, + "grad_norm": 2.484375, + "learning_rate": 0.014640696369150469, + "loss": 3.1151, + "mean_token_accuracy": 0.39688757061958313, + "num_tokens": 3653184166.0, + "step": 7146 + }, + { + "epoch": 1.9326663061114115, + "grad_norm": 2.890625, + "learning_rate": 0.014639254569237408, + "loss": 3.1499, + "mean_token_accuracy": 0.4003064036369324, + "num_tokens": 3653634863.0, + "step": 7147 + }, + { + "epoch": 1.9329367225527312, + "grad_norm": 2.78125, + "learning_rate": 0.014637812657661314, + "loss": 3.1852, + "mean_token_accuracy": 0.395561158657074, + "num_tokens": 3654158970.0, + "step": 7148 + }, + { + "epoch": 1.9332071389940508, + "grad_norm": 3.15625, + "learning_rate": 0.014636370634466433, + "loss": 3.1242, + "mean_token_accuracy": 0.38896334171295166, + "num_tokens": 3654683198.0, + "step": 7149 + }, + { + "epoch": 1.9334775554353705, + "grad_norm": 3.15625, + "learning_rate": 0.014634928499697004, + "loss": 3.2099, + "mean_token_accuracy": 0.39196598529815674, + "num_tokens": 3655207449.0, + "step": 7150 + }, + { + "epoch": 1.9337479718766901, + "grad_norm": 11.125, + "learning_rate": 0.014633486253397282, + "loss": 13.031, + "mean_token_accuracy": 0.013784321025013924, + "num_tokens": 3655731619.0, + "step": 7151 + }, + { + "epoch": 1.9340183883180098, + "grad_norm": 8.75, + "learning_rate": 0.014632043895611513, + "loss": 3.6678, + "mean_token_accuracy": 0.34035950899124146, + "num_tokens": 3656255743.0, + "step": 7152 + }, + { + "epoch": 1.9342888047593294, + "grad_norm": 2.46875, + "learning_rate": 0.014630601426383958, + "loss": 3.3872, + "mean_token_accuracy": 0.38161563873291016, + "num_tokens": 3656767838.0, + "step": 7153 + }, + { + "epoch": 1.934559221200649, + "grad_norm": 3.296875, + "learning_rate": 0.014629158845758876, + "loss": 3.3335, + "mean_token_accuracy": 0.3918638527393341, + "num_tokens": 3657288660.0, + "step": 7154 + }, + { + "epoch": 1.9348296376419687, + "grad_norm": 2.71875, + "learning_rate": 0.014627716153780525, + "loss": 3.1559, + "mean_token_accuracy": 0.40115851163864136, + "num_tokens": 3657812838.0, + "step": 7155 + }, + { + "epoch": 1.9351000540832883, + "grad_norm": 3.03125, + "learning_rate": 0.014626273350493175, + "loss": 3.1895, + "mean_token_accuracy": 0.38643860816955566, + "num_tokens": 3658337032.0, + "step": 7156 + }, + { + "epoch": 1.935370470524608, + "grad_norm": 2.96875, + "learning_rate": 0.014624830435941094, + "loss": 3.2973, + "mean_token_accuracy": 0.38712215423583984, + "num_tokens": 3658861238.0, + "step": 7157 + }, + { + "epoch": 1.9356408869659276, + "grad_norm": 3.328125, + "learning_rate": 0.014623387410168555, + "loss": 3.3956, + "mean_token_accuracy": 0.3615931272506714, + "num_tokens": 3659385425.0, + "step": 7158 + }, + { + "epoch": 1.9359113034072473, + "grad_norm": 2.65625, + "learning_rate": 0.014621944273219839, + "loss": 3.1588, + "mean_token_accuracy": 0.3941454589366913, + "num_tokens": 3659909552.0, + "step": 7159 + }, + { + "epoch": 1.936181719848567, + "grad_norm": 3.1875, + "learning_rate": 0.014620501025139216, + "loss": 3.1989, + "mean_token_accuracy": 0.3810345530509949, + "num_tokens": 3660433787.0, + "step": 7160 + }, + { + "epoch": 1.9364521362898865, + "grad_norm": 2.921875, + "learning_rate": 0.014619057665970976, + "loss": 3.2436, + "mean_token_accuracy": 0.3772282898426056, + "num_tokens": 3660958044.0, + "step": 7161 + }, + { + "epoch": 1.9367225527312062, + "grad_norm": 2.734375, + "learning_rate": 0.014617614195759406, + "loss": 3.1425, + "mean_token_accuracy": 0.40026184916496277, + "num_tokens": 3661482221.0, + "step": 7162 + }, + { + "epoch": 1.9369929691725258, + "grad_norm": 2.703125, + "learning_rate": 0.01461617061454879, + "loss": 3.1544, + "mean_token_accuracy": 0.40122708678245544, + "num_tokens": 3662006435.0, + "step": 7163 + }, + { + "epoch": 1.9372633856138455, + "grad_norm": 2.65625, + "learning_rate": 0.014614726922383426, + "loss": 3.0593, + "mean_token_accuracy": 0.4046279191970825, + "num_tokens": 3662530665.0, + "step": 7164 + }, + { + "epoch": 1.937533802055165, + "grad_norm": 2.640625, + "learning_rate": 0.01461328311930761, + "loss": 3.2314, + "mean_token_accuracy": 0.3882836103439331, + "num_tokens": 3663054933.0, + "step": 7165 + }, + { + "epoch": 1.9378042184964845, + "grad_norm": 2.921875, + "learning_rate": 0.014611839205365644, + "loss": 3.0582, + "mean_token_accuracy": 0.3873295187950134, + "num_tokens": 3663579065.0, + "step": 7166 + }, + { + "epoch": 1.9380746349378042, + "grad_norm": 2.796875, + "learning_rate": 0.014610395180601827, + "loss": 3.2522, + "mean_token_accuracy": 0.37774941325187683, + "num_tokens": 3664103305.0, + "step": 7167 + }, + { + "epoch": 1.9383450513791238, + "grad_norm": 3.46875, + "learning_rate": 0.01460895104506047, + "loss": 3.0639, + "mean_token_accuracy": 0.39210134744644165, + "num_tokens": 3664627431.0, + "step": 7168 + }, + { + "epoch": 1.9386154678204435, + "grad_norm": 2.734375, + "learning_rate": 0.014607506798785884, + "loss": 3.2939, + "mean_token_accuracy": 0.3903229832649231, + "num_tokens": 3665151600.0, + "step": 7169 + }, + { + "epoch": 1.9388858842617631, + "grad_norm": 3.15625, + "learning_rate": 0.014606062441822373, + "loss": 3.1066, + "mean_token_accuracy": 0.3795294165611267, + "num_tokens": 3665675768.0, + "step": 7170 + }, + { + "epoch": 1.9391563007030828, + "grad_norm": 196.0, + "learning_rate": 0.014604617974214267, + "loss": 14.7853, + "mean_token_accuracy": 0.0012052732054144144, + "num_tokens": 3666199981.0, + "step": 7171 + }, + { + "epoch": 1.9394267171444024, + "grad_norm": 5.1875, + "learning_rate": 0.014603173396005878, + "loss": 3.4068, + "mean_token_accuracy": 0.3674964904785156, + "num_tokens": 3666699089.0, + "step": 7172 + }, + { + "epoch": 1.939697133585722, + "grad_norm": 2.25, + "learning_rate": 0.01460172870724154, + "loss": 2.9709, + "mean_token_accuracy": 0.4066959023475647, + "num_tokens": 3667223212.0, + "step": 7173 + }, + { + "epoch": 1.9399675500270417, + "grad_norm": 2.8125, + "learning_rate": 0.014600283907965566, + "loss": 3.1843, + "mean_token_accuracy": 0.3915776014328003, + "num_tokens": 3667747409.0, + "step": 7174 + }, + { + "epoch": 1.9402379664683613, + "grad_norm": 2.9375, + "learning_rate": 0.014598838998222294, + "loss": 3.1814, + "mean_token_accuracy": 0.3739510774612427, + "num_tokens": 3668271370.0, + "step": 7175 + }, + { + "epoch": 1.9405083829096808, + "grad_norm": 3.109375, + "learning_rate": 0.014597393978056063, + "loss": 3.3718, + "mean_token_accuracy": 0.38764235377311707, + "num_tokens": 3668795577.0, + "step": 7176 + }, + { + "epoch": 1.9407787993510004, + "grad_norm": 2.796875, + "learning_rate": 0.0145959488475112, + "loss": 3.2449, + "mean_token_accuracy": 0.3765049874782562, + "num_tokens": 3669319688.0, + "step": 7177 + }, + { + "epoch": 1.94104921579232, + "grad_norm": 2.890625, + "learning_rate": 0.014594503606632057, + "loss": 3.2226, + "mean_token_accuracy": 0.40085065364837646, + "num_tokens": 3669843846.0, + "step": 7178 + }, + { + "epoch": 1.9413196322336397, + "grad_norm": 2.921875, + "learning_rate": 0.01459305825546297, + "loss": 3.0257, + "mean_token_accuracy": 0.39823493361473083, + "num_tokens": 3670318128.0, + "step": 7179 + }, + { + "epoch": 1.9415900486749593, + "grad_norm": 2.828125, + "learning_rate": 0.01459161279404829, + "loss": 2.8987, + "mean_token_accuracy": 0.4238009452819824, + "num_tokens": 3670835857.0, + "step": 7180 + }, + { + "epoch": 1.941860465116279, + "grad_norm": 2.703125, + "learning_rate": 0.014590167222432365, + "loss": 3.1981, + "mean_token_accuracy": 0.3986325263977051, + "num_tokens": 3671360131.0, + "step": 7181 + }, + { + "epoch": 1.9421308815575986, + "grad_norm": 3.703125, + "learning_rate": 0.014588721540659553, + "loss": 3.4773, + "mean_token_accuracy": 0.3688974976539612, + "num_tokens": 3671884410.0, + "step": 7182 + }, + { + "epoch": 1.9424012979989183, + "grad_norm": 3.109375, + "learning_rate": 0.014587275748774211, + "loss": 3.3465, + "mean_token_accuracy": 0.4068468511104584, + "num_tokens": 3672324041.0, + "step": 7183 + }, + { + "epoch": 1.942671714440238, + "grad_norm": 2.734375, + "learning_rate": 0.014585829846820697, + "loss": 3.1847, + "mean_token_accuracy": 0.393589586019516, + "num_tokens": 3672848199.0, + "step": 7184 + }, + { + "epoch": 1.9429421308815575, + "grad_norm": 3.578125, + "learning_rate": 0.01458438383484338, + "loss": 3.0681, + "mean_token_accuracy": 0.40398362278938293, + "num_tokens": 3673321624.0, + "step": 7185 + }, + { + "epoch": 1.9432125473228772, + "grad_norm": 3.546875, + "learning_rate": 0.014582937712886625, + "loss": 3.392, + "mean_token_accuracy": 0.3999842405319214, + "num_tokens": 3673808632.0, + "step": 7186 + }, + { + "epoch": 1.9434829637641968, + "grad_norm": 3.109375, + "learning_rate": 0.014581491480994803, + "loss": 3.1457, + "mean_token_accuracy": 0.39393389225006104, + "num_tokens": 3674332900.0, + "step": 7187 + }, + { + "epoch": 1.9437533802055165, + "grad_norm": 2.484375, + "learning_rate": 0.014580045139212294, + "loss": 3.0536, + "mean_token_accuracy": 0.40286949276924133, + "num_tokens": 3674857138.0, + "step": 7188 + }, + { + "epoch": 1.944023796646836, + "grad_norm": 2.28125, + "learning_rate": 0.014578598687583471, + "loss": 3.0975, + "mean_token_accuracy": 0.3977818489074707, + "num_tokens": 3675381406.0, + "step": 7189 + }, + { + "epoch": 1.9442942130881558, + "grad_norm": 2.921875, + "learning_rate": 0.014577152126152714, + "loss": 3.3253, + "mean_token_accuracy": 0.38439443707466125, + "num_tokens": 3675905655.0, + "step": 7190 + }, + { + "epoch": 1.9445646295294754, + "grad_norm": 58.25, + "learning_rate": 0.014575705454964411, + "loss": 16.0255, + "mean_token_accuracy": 0.012479146011173725, + "num_tokens": 3676429931.0, + "step": 7191 + }, + { + "epoch": 1.944835045970795, + "grad_norm": 6.21875, + "learning_rate": 0.014574258674062951, + "loss": 3.37, + "mean_token_accuracy": 0.34956812858581543, + "num_tokens": 3676954083.0, + "step": 7192 + }, + { + "epoch": 1.9451054624121147, + "grad_norm": 2.96875, + "learning_rate": 0.014572811783492724, + "loss": 3.2853, + "mean_token_accuracy": 0.3934803009033203, + "num_tokens": 3677478267.0, + "step": 7193 + }, + { + "epoch": 1.9453758788534343, + "grad_norm": 2.203125, + "learning_rate": 0.01457136478329812, + "loss": 2.9865, + "mean_token_accuracy": 0.4069439172744751, + "num_tokens": 3678002520.0, + "step": 7194 + }, + { + "epoch": 1.945646295294754, + "grad_norm": 3.015625, + "learning_rate": 0.014569917673523542, + "loss": 3.1923, + "mean_token_accuracy": 0.37704628705978394, + "num_tokens": 3678523656.0, + "step": 7195 + }, + { + "epoch": 1.9459167117360736, + "grad_norm": 3.46875, + "learning_rate": 0.014568470454213398, + "loss": 2.9776, + "mean_token_accuracy": 0.3870619833469391, + "num_tokens": 3678978282.0, + "step": 7196 + }, + { + "epoch": 1.9461871281773933, + "grad_norm": 2.046875, + "learning_rate": 0.01456702312541208, + "loss": 3.1823, + "mean_token_accuracy": 0.40193066000938416, + "num_tokens": 3679462999.0, + "step": 7197 + }, + { + "epoch": 1.946457544618713, + "grad_norm": 3.8125, + "learning_rate": 0.014565575687164005, + "loss": 3.1438, + "mean_token_accuracy": 0.42719167470932007, + "num_tokens": 3679936606.0, + "step": 7198 + }, + { + "epoch": 1.9467279610600325, + "grad_norm": 3.046875, + "learning_rate": 0.014564128139513582, + "loss": 3.4948, + "mean_token_accuracy": 0.36390453577041626, + "num_tokens": 3680459571.0, + "step": 7199 + }, + { + "epoch": 1.9469983775013522, + "grad_norm": 3.34375, + "learning_rate": 0.014562680482505225, + "loss": 3.0552, + "mean_token_accuracy": 0.3856143653392792, + "num_tokens": 3680983821.0, + "step": 7200 + }, + { + "epoch": 1.9472687939426718, + "grad_norm": 2.28125, + "learning_rate": 0.014561232716183354, + "loss": 3.3215, + "mean_token_accuracy": 0.3756641745567322, + "num_tokens": 3681508097.0, + "step": 7201 + }, + { + "epoch": 1.9475392103839915, + "grad_norm": 2.953125, + "learning_rate": 0.014559784840592392, + "loss": 3.2405, + "mean_token_accuracy": 0.3783499002456665, + "num_tokens": 3682032366.0, + "step": 7202 + }, + { + "epoch": 1.9478096268253111, + "grad_norm": 2.375, + "learning_rate": 0.014558336855776762, + "loss": 3.231, + "mean_token_accuracy": 0.3860495686531067, + "num_tokens": 3682556570.0, + "step": 7203 + }, + { + "epoch": 1.9480800432666308, + "grad_norm": 3.28125, + "learning_rate": 0.01455688876178089, + "loss": 3.2242, + "mean_token_accuracy": 0.3903180956840515, + "num_tokens": 3683055528.0, + "step": 7204 + }, + { + "epoch": 1.9483504597079504, + "grad_norm": 2.984375, + "learning_rate": 0.014555440558649213, + "loss": 3.3725, + "mean_token_accuracy": 0.37530815601348877, + "num_tokens": 3683579787.0, + "step": 7205 + }, + { + "epoch": 1.9486208761492698, + "grad_norm": 4.28125, + "learning_rate": 0.014553992246426164, + "loss": 3.2874, + "mean_token_accuracy": 0.36799919605255127, + "num_tokens": 3684103861.0, + "step": 7206 + }, + { + "epoch": 1.9488912925905895, + "grad_norm": 2.359375, + "learning_rate": 0.01455254382515618, + "loss": 3.236, + "mean_token_accuracy": 0.37685152888298035, + "num_tokens": 3684628127.0, + "step": 7207 + }, + { + "epoch": 1.949161709031909, + "grad_norm": 3.296875, + "learning_rate": 0.014551095294883708, + "loss": 3.1902, + "mean_token_accuracy": 0.3739197850227356, + "num_tokens": 3685130213.0, + "step": 7208 + }, + { + "epoch": 1.9494321254732287, + "grad_norm": 2.375, + "learning_rate": 0.014549646655653186, + "loss": 3.1147, + "mean_token_accuracy": 0.4390365183353424, + "num_tokens": 3685589798.0, + "step": 7209 + }, + { + "epoch": 1.9497025419145484, + "grad_norm": 3.828125, + "learning_rate": 0.014548197907509067, + "loss": 3.1735, + "mean_token_accuracy": 0.39939823746681213, + "num_tokens": 3686114019.0, + "step": 7210 + }, + { + "epoch": 1.949972958355868, + "grad_norm": 15.25, + "learning_rate": 0.0145467490504958, + "loss": 17.8634, + "mean_token_accuracy": 9.947496437234804e-06, + "num_tokens": 3686638257.0, + "step": 7211 + }, + { + "epoch": 1.9502433747971877, + "grad_norm": 5.46875, + "learning_rate": 0.014545300084657842, + "loss": 3.4637, + "mean_token_accuracy": 0.38314926624298096, + "num_tokens": 3687064563.0, + "step": 7212 + }, + { + "epoch": 1.9505137912385073, + "grad_norm": 2.015625, + "learning_rate": 0.01454385101003965, + "loss": 3.3226, + "mean_token_accuracy": 0.3761582374572754, + "num_tokens": 3687588834.0, + "step": 7213 + }, + { + "epoch": 1.950784207679827, + "grad_norm": 2.5, + "learning_rate": 0.014542401826685689, + "loss": 2.7522, + "mean_token_accuracy": 0.4132331907749176, + "num_tokens": 3688113002.0, + "step": 7214 + }, + { + "epoch": 1.9510546241211466, + "grad_norm": 2.828125, + "learning_rate": 0.014540952534640425, + "loss": 3.3203, + "mean_token_accuracy": 0.39738553762435913, + "num_tokens": 3688623920.0, + "step": 7215 + }, + { + "epoch": 1.9513250405624663, + "grad_norm": 3.578125, + "learning_rate": 0.014539503133948323, + "loss": 3.253, + "mean_token_accuracy": 0.36030101776123047, + "num_tokens": 3689147996.0, + "step": 7216 + }, + { + "epoch": 1.9515954570037857, + "grad_norm": 3.09375, + "learning_rate": 0.014538053624653855, + "loss": 3.4964, + "mean_token_accuracy": 0.37923985719680786, + "num_tokens": 3689672217.0, + "step": 7217 + }, + { + "epoch": 1.9518658734451053, + "grad_norm": 3.515625, + "learning_rate": 0.014536604006801497, + "loss": 3.1736, + "mean_token_accuracy": 0.3946347236633301, + "num_tokens": 3690133697.0, + "step": 7218 + }, + { + "epoch": 1.952136289886425, + "grad_norm": 2.890625, + "learning_rate": 0.014535154280435725, + "loss": 3.2819, + "mean_token_accuracy": 0.3896426260471344, + "num_tokens": 3690606135.0, + "step": 7219 + }, + { + "epoch": 1.9524067063277446, + "grad_norm": 3.25, + "learning_rate": 0.014533704445601023, + "loss": 3.0964, + "mean_token_accuracy": 0.4175615608692169, + "num_tokens": 3691086986.0, + "step": 7220 + }, + { + "epoch": 1.9526771227690642, + "grad_norm": 2.515625, + "learning_rate": 0.01453225450234188, + "loss": 3.2222, + "mean_token_accuracy": 0.3950960040092468, + "num_tokens": 3691597495.0, + "step": 7221 + }, + { + "epoch": 1.9529475392103839, + "grad_norm": 2.875, + "learning_rate": 0.014530804450702782, + "loss": 3.3114, + "mean_token_accuracy": 0.39618298411369324, + "num_tokens": 3692101349.0, + "step": 7222 + }, + { + "epoch": 1.9532179556517035, + "grad_norm": 2.328125, + "learning_rate": 0.014529354290728219, + "loss": 3.0068, + "mean_token_accuracy": 0.3907544016838074, + "num_tokens": 3692625520.0, + "step": 7223 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 2.40625, + "learning_rate": 0.014527904022462686, + "loss": 3.1144, + "mean_token_accuracy": 0.42092791199684143, + "num_tokens": 3693124420.0, + "step": 7224 + }, + { + "epoch": 1.9537587885343428, + "grad_norm": 2.96875, + "learning_rate": 0.014526453645950684, + "loss": 3.1876, + "mean_token_accuracy": 0.391930490732193, + "num_tokens": 3693648698.0, + "step": 7225 + }, + { + "epoch": 1.9540292049756625, + "grad_norm": 2.5625, + "learning_rate": 0.014525003161236713, + "loss": 3.0735, + "mean_token_accuracy": 0.38953253626823425, + "num_tokens": 3694172965.0, + "step": 7226 + }, + { + "epoch": 1.954299621416982, + "grad_norm": 2.703125, + "learning_rate": 0.014523552568365277, + "loss": 3.1458, + "mean_token_accuracy": 0.38594645261764526, + "num_tokens": 3694697142.0, + "step": 7227 + }, + { + "epoch": 1.9545700378583017, + "grad_norm": 2.359375, + "learning_rate": 0.014522101867380888, + "loss": 3.0265, + "mean_token_accuracy": 0.3981938362121582, + "num_tokens": 3695221411.0, + "step": 7228 + }, + { + "epoch": 1.9548404542996214, + "grad_norm": 3.078125, + "learning_rate": 0.014520651058328057, + "loss": 3.1702, + "mean_token_accuracy": 0.39534616470336914, + "num_tokens": 3695745410.0, + "step": 7229 + }, + { + "epoch": 1.955110870740941, + "grad_norm": 3.171875, + "learning_rate": 0.014519200141251297, + "loss": 3.3701, + "mean_token_accuracy": 0.38755038380622864, + "num_tokens": 3696261736.0, + "step": 7230 + }, + { + "epoch": 1.9553812871822607, + "grad_norm": 86.5, + "learning_rate": 0.014517749116195125, + "loss": 12.855, + "mean_token_accuracy": 0.024499529972672462, + "num_tokens": 3696747037.0, + "step": 7231 + }, + { + "epoch": 1.9556517036235803, + "grad_norm": 6.40625, + "learning_rate": 0.014516297983204068, + "loss": 3.6004, + "mean_token_accuracy": 0.36978578567504883, + "num_tokens": 3697242524.0, + "step": 7232 + }, + { + "epoch": 1.9559221200649, + "grad_norm": 2.59375, + "learning_rate": 0.014514846742322647, + "loss": 3.2405, + "mean_token_accuracy": 0.3859421908855438, + "num_tokens": 3697766775.0, + "step": 7233 + }, + { + "epoch": 1.9561925365062196, + "grad_norm": 3.296875, + "learning_rate": 0.014513395393595393, + "loss": 3.2442, + "mean_token_accuracy": 0.3900899291038513, + "num_tokens": 3698291050.0, + "step": 7234 + }, + { + "epoch": 1.9564629529475392, + "grad_norm": 2.953125, + "learning_rate": 0.014511943937066831, + "loss": 3.1589, + "mean_token_accuracy": 0.37794163823127747, + "num_tokens": 3698815168.0, + "step": 7235 + }, + { + "epoch": 1.956733369388859, + "grad_norm": 2.34375, + "learning_rate": 0.014510492372781505, + "loss": 3.1758, + "mean_token_accuracy": 0.4062238931655884, + "num_tokens": 3699337064.0, + "step": 7236 + }, + { + "epoch": 1.9570037858301785, + "grad_norm": 2.6875, + "learning_rate": 0.01450904070078395, + "loss": 3.1043, + "mean_token_accuracy": 0.3887491226196289, + "num_tokens": 3699861112.0, + "step": 7237 + }, + { + "epoch": 1.9572742022714982, + "grad_norm": 3.40625, + "learning_rate": 0.014507588921118706, + "loss": 3.0924, + "mean_token_accuracy": 0.3973062336444855, + "num_tokens": 3700329212.0, + "step": 7238 + }, + { + "epoch": 1.9575446187128178, + "grad_norm": 2.59375, + "learning_rate": 0.01450613703383032, + "loss": 3.4016, + "mean_token_accuracy": 0.3777681589126587, + "num_tokens": 3700841157.0, + "step": 7239 + }, + { + "epoch": 1.9578150351541375, + "grad_norm": 4.0, + "learning_rate": 0.014504685038963336, + "loss": 3.3268, + "mean_token_accuracy": 0.4379022419452667, + "num_tokens": 3701232543.0, + "step": 7240 + }, + { + "epoch": 1.958085451595457, + "grad_norm": 3.03125, + "learning_rate": 0.01450323293656231, + "loss": 2.7389, + "mean_token_accuracy": 0.46232062578201294, + "num_tokens": 3701733432.0, + "step": 7241 + }, + { + "epoch": 1.9583558680367767, + "grad_norm": 2.5, + "learning_rate": 0.014501780726671796, + "loss": 3.2098, + "mean_token_accuracy": 0.4050573408603668, + "num_tokens": 3702257636.0, + "step": 7242 + }, + { + "epoch": 1.9586262844780964, + "grad_norm": 2.734375, + "learning_rate": 0.01450032840933635, + "loss": 3.1397, + "mean_token_accuracy": 0.40825486183166504, + "num_tokens": 3702781817.0, + "step": 7243 + }, + { + "epoch": 1.958896700919416, + "grad_norm": 2.125, + "learning_rate": 0.014498875984600534, + "loss": 3.1808, + "mean_token_accuracy": 0.4016724228858948, + "num_tokens": 3703305990.0, + "step": 7244 + }, + { + "epoch": 1.9591671173607357, + "grad_norm": 3.015625, + "learning_rate": 0.014497423452508915, + "loss": 3.1751, + "mean_token_accuracy": 0.3898567259311676, + "num_tokens": 3703830182.0, + "step": 7245 + }, + { + "epoch": 1.9594375338020553, + "grad_norm": 3.03125, + "learning_rate": 0.01449597081310606, + "loss": 3.2923, + "mean_token_accuracy": 0.39606818556785583, + "num_tokens": 3704354402.0, + "step": 7246 + }, + { + "epoch": 1.959707950243375, + "grad_norm": 2.75, + "learning_rate": 0.014494518066436536, + "loss": 3.1025, + "mean_token_accuracy": 0.40273475646972656, + "num_tokens": 3704828565.0, + "step": 7247 + }, + { + "epoch": 1.9599783666846944, + "grad_norm": 4.09375, + "learning_rate": 0.014493065212544926, + "loss": 2.926, + "mean_token_accuracy": 0.4072679877281189, + "num_tokens": 3705295395.0, + "step": 7248 + }, + { + "epoch": 1.960248783126014, + "grad_norm": 2.796875, + "learning_rate": 0.014491612251475796, + "loss": 3.2835, + "mean_token_accuracy": 0.3905832767486572, + "num_tokens": 3705819614.0, + "step": 7249 + }, + { + "epoch": 1.9605191995673337, + "grad_norm": 3.890625, + "learning_rate": 0.014490159183273739, + "loss": 3.3785, + "mean_token_accuracy": 0.39050114154815674, + "num_tokens": 3706343763.0, + "step": 7250 + }, + { + "epoch": 1.9607896160086533, + "grad_norm": 127.5, + "learning_rate": 0.014488706007983335, + "loss": 15.0155, + "mean_token_accuracy": 0.01921393722295761, + "num_tokens": 3706868041.0, + "step": 7251 + }, + { + "epoch": 1.961060032449973, + "grad_norm": 5.75, + "learning_rate": 0.01448725272564917, + "loss": 3.6222, + "mean_token_accuracy": 0.3532320261001587, + "num_tokens": 3707392308.0, + "step": 7252 + }, + { + "epoch": 1.9613304488912926, + "grad_norm": 2.140625, + "learning_rate": 0.014485799336315832, + "loss": 3.4372, + "mean_token_accuracy": 0.36164379119873047, + "num_tokens": 3707916584.0, + "step": 7253 + }, + { + "epoch": 1.9616008653326122, + "grad_norm": 2.59375, + "learning_rate": 0.014484345840027921, + "loss": 3.2717, + "mean_token_accuracy": 0.39689815044403076, + "num_tokens": 3708410658.0, + "step": 7254 + }, + { + "epoch": 1.9618712817739319, + "grad_norm": 3.515625, + "learning_rate": 0.014482892236830037, + "loss": 3.0695, + "mean_token_accuracy": 0.39701369404792786, + "num_tokens": 3708934894.0, + "step": 7255 + }, + { + "epoch": 1.9621416982152515, + "grad_norm": 2.6875, + "learning_rate": 0.014481438526766772, + "loss": 3.236, + "mean_token_accuracy": 0.3826150894165039, + "num_tokens": 3709459048.0, + "step": 7256 + }, + { + "epoch": 1.9624121146565712, + "grad_norm": 2.890625, + "learning_rate": 0.014479984709882739, + "loss": 2.9072, + "mean_token_accuracy": 0.4355042278766632, + "num_tokens": 3709983167.0, + "step": 7257 + }, + { + "epoch": 1.9626825310978906, + "grad_norm": 2.46875, + "learning_rate": 0.014478530786222538, + "loss": 3.3528, + "mean_token_accuracy": 0.3675364553928375, + "num_tokens": 3710507294.0, + "step": 7258 + }, + { + "epoch": 1.9629529475392102, + "grad_norm": 2.84375, + "learning_rate": 0.014477076755830785, + "loss": 3.3262, + "mean_token_accuracy": 0.3779369592666626, + "num_tokens": 3711031433.0, + "step": 7259 + }, + { + "epoch": 1.9632233639805299, + "grad_norm": 3.4375, + "learning_rate": 0.014475622618752088, + "loss": 3.4624, + "mean_token_accuracy": 0.4003199636936188, + "num_tokens": 3711474188.0, + "step": 7260 + }, + { + "epoch": 1.9634937804218495, + "grad_norm": 3.203125, + "learning_rate": 0.014474168375031073, + "loss": 3.2982, + "mean_token_accuracy": 0.3910820782184601, + "num_tokens": 3711948449.0, + "step": 7261 + }, + { + "epoch": 1.9637641968631692, + "grad_norm": 3.375, + "learning_rate": 0.014472714024712353, + "loss": 2.9924, + "mean_token_accuracy": 0.4106810986995697, + "num_tokens": 3712472629.0, + "step": 7262 + }, + { + "epoch": 1.9640346133044888, + "grad_norm": 2.859375, + "learning_rate": 0.01447125956784055, + "loss": 3.2626, + "mean_token_accuracy": 0.383162260055542, + "num_tokens": 3712979380.0, + "step": 7263 + }, + { + "epoch": 1.9643050297458085, + "grad_norm": 2.671875, + "learning_rate": 0.0144698050044603, + "loss": 3.0974, + "mean_token_accuracy": 0.4152708649635315, + "num_tokens": 3713428472.0, + "step": 7264 + }, + { + "epoch": 1.964575446187128, + "grad_norm": 3.25, + "learning_rate": 0.014468350334616228, + "loss": 3.4978, + "mean_token_accuracy": 0.38408753275871277, + "num_tokens": 3713952750.0, + "step": 7265 + }, + { + "epoch": 1.9648458626284477, + "grad_norm": 3.1875, + "learning_rate": 0.014466895558352968, + "loss": 3.0498, + "mean_token_accuracy": 0.3893270790576935, + "num_tokens": 3714476965.0, + "step": 7266 + }, + { + "epoch": 1.9651162790697674, + "grad_norm": 2.796875, + "learning_rate": 0.014465440675715159, + "loss": 3.2409, + "mean_token_accuracy": 0.37888142466545105, + "num_tokens": 3715001246.0, + "step": 7267 + }, + { + "epoch": 1.965386695511087, + "grad_norm": 3.671875, + "learning_rate": 0.014463985686747434, + "loss": 3.4647, + "mean_token_accuracy": 0.369617760181427, + "num_tokens": 3715525470.0, + "step": 7268 + }, + { + "epoch": 1.9656571119524067, + "grad_norm": 2.6875, + "learning_rate": 0.014462530591494443, + "loss": 3.2214, + "mean_token_accuracy": 0.38021320104599, + "num_tokens": 3716049722.0, + "step": 7269 + }, + { + "epoch": 1.9659275283937263, + "grad_norm": 2.421875, + "learning_rate": 0.01446107539000083, + "loss": 3.3231, + "mean_token_accuracy": 0.38721221685409546, + "num_tokens": 3716573970.0, + "step": 7270 + }, + { + "epoch": 1.966197944835046, + "grad_norm": 33.25, + "learning_rate": 0.01445962008231125, + "loss": 9.595, + "mean_token_accuracy": 0.01602986454963684, + "num_tokens": 3717088873.0, + "step": 7271 + }, + { + "epoch": 1.9664683612763656, + "grad_norm": 7.4375, + "learning_rate": 0.014458164668470346, + "loss": 3.8942, + "mean_token_accuracy": 0.3135097026824951, + "num_tokens": 3717593329.0, + "step": 7272 + }, + { + "epoch": 1.9667387777176852, + "grad_norm": 2.375, + "learning_rate": 0.014456709148522784, + "loss": 3.1171, + "mean_token_accuracy": 0.384726881980896, + "num_tokens": 3718093675.0, + "step": 7273 + }, + { + "epoch": 1.9670091941590049, + "grad_norm": 2.453125, + "learning_rate": 0.014455253522513218, + "loss": 2.9748, + "mean_token_accuracy": 0.4344140589237213, + "num_tokens": 3718617955.0, + "step": 7274 + }, + { + "epoch": 1.9672796106003245, + "grad_norm": 2.71875, + "learning_rate": 0.014453797790486314, + "loss": 3.1868, + "mean_token_accuracy": 0.3988712430000305, + "num_tokens": 3719134697.0, + "step": 7275 + }, + { + "epoch": 1.9675500270416442, + "grad_norm": 2.484375, + "learning_rate": 0.014452341952486736, + "loss": 3.4001, + "mean_token_accuracy": 0.39301711320877075, + "num_tokens": 3719606023.0, + "step": 7276 + }, + { + "epoch": 1.9678204434829638, + "grad_norm": 2.609375, + "learning_rate": 0.014450886008559155, + "loss": 3.329, + "mean_token_accuracy": 0.36202627420425415, + "num_tokens": 3720130309.0, + "step": 7277 + }, + { + "epoch": 1.9680908599242835, + "grad_norm": 2.34375, + "learning_rate": 0.014449429958748239, + "loss": 3.2376, + "mean_token_accuracy": 0.3965771794319153, + "num_tokens": 3720654415.0, + "step": 7278 + }, + { + "epoch": 1.968361276365603, + "grad_norm": 3.125, + "learning_rate": 0.014447973803098674, + "loss": 3.1442, + "mean_token_accuracy": 0.3879443407058716, + "num_tokens": 3721178557.0, + "step": 7279 + }, + { + "epoch": 1.9686316928069227, + "grad_norm": 3.0, + "learning_rate": 0.014446517541655126, + "loss": 3.3443, + "mean_token_accuracy": 0.39590126276016235, + "num_tokens": 3721639146.0, + "step": 7280 + }, + { + "epoch": 1.9689021092482424, + "grad_norm": 2.515625, + "learning_rate": 0.01444506117446229, + "loss": 3.0041, + "mean_token_accuracy": 0.39554476737976074, + "num_tokens": 3722163415.0, + "step": 7281 + }, + { + "epoch": 1.969172525689562, + "grad_norm": 2.390625, + "learning_rate": 0.014443604701564845, + "loss": 3.1763, + "mean_token_accuracy": 0.4120725393295288, + "num_tokens": 3722676133.0, + "step": 7282 + }, + { + "epoch": 1.9694429421308817, + "grad_norm": 3.203125, + "learning_rate": 0.01444214812300748, + "loss": 3.277, + "mean_token_accuracy": 0.3992076516151428, + "num_tokens": 3723200229.0, + "step": 7283 + }, + { + "epoch": 1.9697133585722013, + "grad_norm": 3.21875, + "learning_rate": 0.014440691438834885, + "loss": 3.2167, + "mean_token_accuracy": 0.39833950996398926, + "num_tokens": 3723688616.0, + "step": 7284 + }, + { + "epoch": 1.969983775013521, + "grad_norm": 2.9375, + "learning_rate": 0.014439234649091762, + "loss": 3.2525, + "mean_token_accuracy": 0.4023050367832184, + "num_tokens": 3724212799.0, + "step": 7285 + }, + { + "epoch": 1.9702541914548406, + "grad_norm": 6.4375, + "learning_rate": 0.014437777753822803, + "loss": 3.009, + "mean_token_accuracy": 0.4202535152435303, + "num_tokens": 3724710468.0, + "step": 7286 + }, + { + "epoch": 1.9705246078961602, + "grad_norm": 2.921875, + "learning_rate": 0.014436320753072718, + "loss": 3.1634, + "mean_token_accuracy": 0.3814569413661957, + "num_tokens": 3725234679.0, + "step": 7287 + }, + { + "epoch": 1.9707950243374799, + "grad_norm": 2.875, + "learning_rate": 0.014434863646886203, + "loss": 3.2073, + "mean_token_accuracy": 0.38667166233062744, + "num_tokens": 3725758910.0, + "step": 7288 + }, + { + "epoch": 1.9710654407787993, + "grad_norm": 4.09375, + "learning_rate": 0.01443340643530797, + "loss": 3.1299, + "mean_token_accuracy": 0.36650192737579346, + "num_tokens": 3726283168.0, + "step": 7289 + }, + { + "epoch": 1.971335857220119, + "grad_norm": 2.0625, + "learning_rate": 0.01443194911838273, + "loss": 3.2243, + "mean_token_accuracy": 0.37787485122680664, + "num_tokens": 3726807445.0, + "step": 7290 + }, + { + "epoch": 1.9716062736614386, + "grad_norm": 35.75, + "learning_rate": 0.0144304916961552, + "loss": 13.1656, + "mean_token_accuracy": 0.009236900135874748, + "num_tokens": 3727331697.0, + "step": 7291 + }, + { + "epoch": 1.9718766901027582, + "grad_norm": 8.5, + "learning_rate": 0.014429034168670094, + "loss": 3.8986, + "mean_token_accuracy": 0.3372492790222168, + "num_tokens": 3727824451.0, + "step": 7292 + }, + { + "epoch": 1.9721471065440779, + "grad_norm": 3.328125, + "learning_rate": 0.014427576535972136, + "loss": 3.5444, + "mean_token_accuracy": 0.38086697459220886, + "num_tokens": 3728348661.0, + "step": 7293 + }, + { + "epoch": 1.9724175229853975, + "grad_norm": 4.40625, + "learning_rate": 0.014426118798106053, + "loss": 3.238, + "mean_token_accuracy": 0.38381966948509216, + "num_tokens": 3728868165.0, + "step": 7294 + }, + { + "epoch": 1.9726879394267172, + "grad_norm": 3.015625, + "learning_rate": 0.014424660955116568, + "loss": 3.3836, + "mean_token_accuracy": 0.37889954447746277, + "num_tokens": 3729392415.0, + "step": 7295 + }, + { + "epoch": 1.9729583558680368, + "grad_norm": 3.1875, + "learning_rate": 0.014423203007048413, + "loss": 3.2554, + "mean_token_accuracy": 0.39002686738967896, + "num_tokens": 3729916685.0, + "step": 7296 + }, + { + "epoch": 1.9732287723093564, + "grad_norm": 11.3125, + "learning_rate": 0.014421744953946321, + "loss": 2.9695, + "mean_token_accuracy": 0.42202699184417725, + "num_tokens": 3730440770.0, + "step": 7297 + }, + { + "epoch": 1.973499188750676, + "grad_norm": 3.078125, + "learning_rate": 0.014420286795855034, + "loss": 3.4088, + "mean_token_accuracy": 0.3927641212940216, + "num_tokens": 3730964939.0, + "step": 7298 + }, + { + "epoch": 1.9737696051919955, + "grad_norm": 2.40625, + "learning_rate": 0.014418828532819289, + "loss": 3.197, + "mean_token_accuracy": 0.41703906655311584, + "num_tokens": 3731428005.0, + "step": 7299 + }, + { + "epoch": 1.9740400216333152, + "grad_norm": 3.59375, + "learning_rate": 0.01441737016488383, + "loss": 3.1565, + "mean_token_accuracy": 0.381928026676178, + "num_tokens": 3731952169.0, + "step": 7300 + }, + { + "epoch": 1.9743104380746348, + "grad_norm": 3.0, + "learning_rate": 0.014415911692093407, + "loss": 3.249, + "mean_token_accuracy": 0.3960442543029785, + "num_tokens": 3732428546.0, + "step": 7301 + }, + { + "epoch": 1.9745808545159544, + "grad_norm": 2.671875, + "learning_rate": 0.014414453114492765, + "loss": 3.198, + "mean_token_accuracy": 0.39431852102279663, + "num_tokens": 3732910551.0, + "step": 7302 + }, + { + "epoch": 1.974851270957274, + "grad_norm": 2.953125, + "learning_rate": 0.014412994432126658, + "loss": 3.2794, + "mean_token_accuracy": 0.37999051809310913, + "num_tokens": 3733434790.0, + "step": 7303 + }, + { + "epoch": 1.9751216873985937, + "grad_norm": 2.3125, + "learning_rate": 0.014411535645039851, + "loss": 3.2935, + "mean_token_accuracy": 0.3996860086917877, + "num_tokens": 3733935952.0, + "step": 7304 + }, + { + "epoch": 1.9753921038399134, + "grad_norm": 2.53125, + "learning_rate": 0.014410076753277093, + "loss": 3.3352, + "mean_token_accuracy": 0.3567744493484497, + "num_tokens": 3734460200.0, + "step": 7305 + }, + { + "epoch": 1.975662520281233, + "grad_norm": 2.65625, + "learning_rate": 0.014408617756883154, + "loss": 3.1817, + "mean_token_accuracy": 0.3838418126106262, + "num_tokens": 3734984328.0, + "step": 7306 + }, + { + "epoch": 1.9759329367225527, + "grad_norm": 2.5, + "learning_rate": 0.014407158655902797, + "loss": 3.1921, + "mean_token_accuracy": 0.39963364601135254, + "num_tokens": 3735508524.0, + "step": 7307 + }, + { + "epoch": 1.9762033531638723, + "grad_norm": 2.59375, + "learning_rate": 0.014405699450380794, + "loss": 2.9035, + "mean_token_accuracy": 0.4124755859375, + "num_tokens": 3735978552.0, + "step": 7308 + }, + { + "epoch": 1.976473769605192, + "grad_norm": 2.65625, + "learning_rate": 0.014404240140361915, + "loss": 3.1528, + "mean_token_accuracy": 0.40852072834968567, + "num_tokens": 3736502827.0, + "step": 7309 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 3.796875, + "learning_rate": 0.014402780725890938, + "loss": 3.2163, + "mean_token_accuracy": 0.3818734288215637, + "num_tokens": 3737026945.0, + "step": 7310 + }, + { + "epoch": 1.9770146024878312, + "grad_norm": 56.0, + "learning_rate": 0.014401321207012641, + "loss": 13.9575, + "mean_token_accuracy": 0.008732074871659279, + "num_tokens": 3737551209.0, + "step": 7311 + }, + { + "epoch": 1.9772850189291509, + "grad_norm": 6.09375, + "learning_rate": 0.014399861583771805, + "loss": 3.6387, + "mean_token_accuracy": 0.3395862877368927, + "num_tokens": 3738027998.0, + "step": 7312 + }, + { + "epoch": 1.9775554353704705, + "grad_norm": 18.875, + "learning_rate": 0.014398401856213218, + "loss": 3.5127, + "mean_token_accuracy": 0.3694630265235901, + "num_tokens": 3738549884.0, + "step": 7313 + }, + { + "epoch": 1.9778258518117902, + "grad_norm": 3.203125, + "learning_rate": 0.01439694202438167, + "loss": 3.3979, + "mean_token_accuracy": 0.3700546622276306, + "num_tokens": 3739074063.0, + "step": 7314 + }, + { + "epoch": 1.9780962682531098, + "grad_norm": 2.84375, + "learning_rate": 0.014395482088321952, + "loss": 3.4717, + "mean_token_accuracy": 0.3916087746620178, + "num_tokens": 3739532882.0, + "step": 7315 + }, + { + "epoch": 1.9783666846944294, + "grad_norm": 4.1875, + "learning_rate": 0.014394022048078856, + "loss": 3.3163, + "mean_token_accuracy": 0.36460697650909424, + "num_tokens": 3740057154.0, + "step": 7316 + }, + { + "epoch": 1.978637101135749, + "grad_norm": 2.859375, + "learning_rate": 0.014392561903697183, + "loss": 3.3827, + "mean_token_accuracy": 0.3753807544708252, + "num_tokens": 3740581312.0, + "step": 7317 + }, + { + "epoch": 1.9789075175770687, + "grad_norm": 3.390625, + "learning_rate": 0.014391101655221734, + "loss": 3.6123, + "mean_token_accuracy": 0.35864579677581787, + "num_tokens": 3741105575.0, + "step": 7318 + }, + { + "epoch": 1.9791779340183884, + "grad_norm": 2.0625, + "learning_rate": 0.014389641302697312, + "loss": 3.035, + "mean_token_accuracy": 0.4086047410964966, + "num_tokens": 3741629703.0, + "step": 7319 + }, + { + "epoch": 1.979448350459708, + "grad_norm": 2.828125, + "learning_rate": 0.014388180846168728, + "loss": 3.2264, + "mean_token_accuracy": 0.36924028396606445, + "num_tokens": 3742153956.0, + "step": 7320 + }, + { + "epoch": 1.9797187669010277, + "grad_norm": 2.671875, + "learning_rate": 0.01438672028568079, + "loss": 3.1939, + "mean_token_accuracy": 0.39384347200393677, + "num_tokens": 3742638014.0, + "step": 7321 + }, + { + "epoch": 1.9799891833423473, + "grad_norm": 3.390625, + "learning_rate": 0.014385259621278314, + "loss": 3.3312, + "mean_token_accuracy": 0.380562424659729, + "num_tokens": 3743102592.0, + "step": 7322 + }, + { + "epoch": 1.980259599783667, + "grad_norm": 2.578125, + "learning_rate": 0.014383798853006119, + "loss": 3.3207, + "mean_token_accuracy": 0.3576399087905884, + "num_tokens": 3743626852.0, + "step": 7323 + }, + { + "epoch": 1.9805300162249866, + "grad_norm": 2.453125, + "learning_rate": 0.014382337980909026, + "loss": 3.1979, + "mean_token_accuracy": 0.37835296988487244, + "num_tokens": 3744151091.0, + "step": 7324 + }, + { + "epoch": 1.9808004326663062, + "grad_norm": 2.578125, + "learning_rate": 0.014380877005031854, + "loss": 3.188, + "mean_token_accuracy": 0.388580858707428, + "num_tokens": 3744675274.0, + "step": 7325 + }, + { + "epoch": 1.9810708491076259, + "grad_norm": 3.015625, + "learning_rate": 0.014379415925419432, + "loss": 3.1974, + "mean_token_accuracy": 0.3954789638519287, + "num_tokens": 3745147650.0, + "step": 7326 + }, + { + "epoch": 1.9813412655489455, + "grad_norm": 2.65625, + "learning_rate": 0.014377954742116592, + "loss": 3.1814, + "mean_token_accuracy": 0.40331411361694336, + "num_tokens": 3745671924.0, + "step": 7327 + }, + { + "epoch": 1.9816116819902652, + "grad_norm": 3.5625, + "learning_rate": 0.01437649345516817, + "loss": 3.1452, + "mean_token_accuracy": 0.38800737261772156, + "num_tokens": 3746196090.0, + "step": 7328 + }, + { + "epoch": 1.9818820984315848, + "grad_norm": 2.921875, + "learning_rate": 0.014375032064618995, + "loss": 3.1465, + "mean_token_accuracy": 0.3905753493309021, + "num_tokens": 3746720362.0, + "step": 7329 + }, + { + "epoch": 1.9821525148729042, + "grad_norm": 2.578125, + "learning_rate": 0.014373570570513914, + "loss": 3.0543, + "mean_token_accuracy": 0.39015719294548035, + "num_tokens": 3747244322.0, + "step": 7330 + }, + { + "epoch": 1.9824229313142239, + "grad_norm": 52.75, + "learning_rate": 0.014372108972897766, + "loss": 11.6188, + "mean_token_accuracy": 0.036038391292095184, + "num_tokens": 3747768525.0, + "step": 7331 + }, + { + "epoch": 1.9826933477555435, + "grad_norm": 7.5625, + "learning_rate": 0.014370647271815395, + "loss": 3.7498, + "mean_token_accuracy": 0.34077149629592896, + "num_tokens": 3748284271.0, + "step": 7332 + }, + { + "epoch": 1.9829637641968632, + "grad_norm": 2.859375, + "learning_rate": 0.014369185467311658, + "loss": 3.0945, + "mean_token_accuracy": 0.39708900451660156, + "num_tokens": 3748808467.0, + "step": 7333 + }, + { + "epoch": 1.9832341806381828, + "grad_norm": 2.21875, + "learning_rate": 0.014367723559431398, + "loss": 3.2645, + "mean_token_accuracy": 0.3841981291770935, + "num_tokens": 3749332667.0, + "step": 7334 + }, + { + "epoch": 1.9835045970795024, + "grad_norm": 3.390625, + "learning_rate": 0.014366261548219477, + "loss": 3.2491, + "mean_token_accuracy": 0.3976972699165344, + "num_tokens": 3749830661.0, + "step": 7335 + }, + { + "epoch": 1.983775013520822, + "grad_norm": 3.03125, + "learning_rate": 0.014364799433720755, + "loss": 3.1546, + "mean_token_accuracy": 0.3652495741844177, + "num_tokens": 3750354799.0, + "step": 7336 + }, + { + "epoch": 1.9840454299621417, + "grad_norm": 3.34375, + "learning_rate": 0.01436333721598009, + "loss": 3.4498, + "mean_token_accuracy": 0.3837316632270813, + "num_tokens": 3750878968.0, + "step": 7337 + }, + { + "epoch": 1.9843158464034614, + "grad_norm": 2.875, + "learning_rate": 0.014361874895042346, + "loss": 3.3432, + "mean_token_accuracy": 0.3591400384902954, + "num_tokens": 3751403167.0, + "step": 7338 + }, + { + "epoch": 1.984586262844781, + "grad_norm": 3.125, + "learning_rate": 0.014360412470952395, + "loss": 3.1242, + "mean_token_accuracy": 0.3904768228530884, + "num_tokens": 3751927337.0, + "step": 7339 + }, + { + "epoch": 1.9848566792861004, + "grad_norm": 3.59375, + "learning_rate": 0.014358949943755106, + "loss": 3.1804, + "mean_token_accuracy": 0.40899091958999634, + "num_tokens": 3752394733.0, + "step": 7340 + }, + { + "epoch": 1.98512709572742, + "grad_norm": 2.96875, + "learning_rate": 0.014357487313495355, + "loss": 3.234, + "mean_token_accuracy": 0.384666383266449, + "num_tokens": 3752894682.0, + "step": 7341 + }, + { + "epoch": 1.9853975121687397, + "grad_norm": 2.859375, + "learning_rate": 0.014356024580218021, + "loss": 3.357, + "mean_token_accuracy": 0.36209940910339355, + "num_tokens": 3753418906.0, + "step": 7342 + }, + { + "epoch": 1.9856679286100594, + "grad_norm": 2.703125, + "learning_rate": 0.014354561743967984, + "loss": 3.1242, + "mean_token_accuracy": 0.38786354660987854, + "num_tokens": 3753943157.0, + "step": 7343 + }, + { + "epoch": 1.985938345051379, + "grad_norm": 2.515625, + "learning_rate": 0.014353098804790125, + "loss": 3.1087, + "mean_token_accuracy": 0.39695462584495544, + "num_tokens": 3754467299.0, + "step": 7344 + }, + { + "epoch": 1.9862087614926986, + "grad_norm": 2.8125, + "learning_rate": 0.014351635762729336, + "loss": 3.2288, + "mean_token_accuracy": 0.39665496349334717, + "num_tokens": 3754991449.0, + "step": 7345 + }, + { + "epoch": 1.9864791779340183, + "grad_norm": 3.359375, + "learning_rate": 0.014350172617830503, + "loss": 3.2463, + "mean_token_accuracy": 0.3777565360069275, + "num_tokens": 3755515712.0, + "step": 7346 + }, + { + "epoch": 1.986749594375338, + "grad_norm": 2.984375, + "learning_rate": 0.014348709370138522, + "loss": 3.103, + "mean_token_accuracy": 0.3881019949913025, + "num_tokens": 3756039799.0, + "step": 7347 + }, + { + "epoch": 1.9870200108166576, + "grad_norm": 3.234375, + "learning_rate": 0.014347246019698293, + "loss": 2.9996, + "mean_token_accuracy": 0.41819483041763306, + "num_tokens": 3756488707.0, + "step": 7348 + }, + { + "epoch": 1.9872904272579772, + "grad_norm": 2.921875, + "learning_rate": 0.01434578256655471, + "loss": 3.3599, + "mean_token_accuracy": 0.3916993737220764, + "num_tokens": 3757012965.0, + "step": 7349 + }, + { + "epoch": 1.9875608436992969, + "grad_norm": 3.0, + "learning_rate": 0.01434431901075268, + "loss": 3.0859, + "mean_token_accuracy": 0.3781302571296692, + "num_tokens": 3757537083.0, + "step": 7350 + }, + { + "epoch": 1.9878312601406165, + "grad_norm": 152.0, + "learning_rate": 0.014342855352337107, + "loss": 14.6373, + "mean_token_accuracy": 0.006756306625902653, + "num_tokens": 3758061333.0, + "step": 7351 + }, + { + "epoch": 1.9881016765819362, + "grad_norm": 5.0625, + "learning_rate": 0.0143413915913529, + "loss": 3.4865, + "mean_token_accuracy": 0.37618350982666016, + "num_tokens": 3758575175.0, + "step": 7352 + }, + { + "epoch": 1.9883720930232558, + "grad_norm": 2.109375, + "learning_rate": 0.014339927727844974, + "loss": 3.2609, + "mean_token_accuracy": 0.3351917266845703, + "num_tokens": 3759099372.0, + "step": 7353 + }, + { + "epoch": 1.9886425094645754, + "grad_norm": 2.5, + "learning_rate": 0.014338463761858242, + "loss": 3.4579, + "mean_token_accuracy": 0.35659298300743103, + "num_tokens": 3759623545.0, + "step": 7354 + }, + { + "epoch": 1.988912925905895, + "grad_norm": 2.8125, + "learning_rate": 0.014336999693437625, + "loss": 2.9815, + "mean_token_accuracy": 0.38624101877212524, + "num_tokens": 3760147749.0, + "step": 7355 + }, + { + "epoch": 1.9891833423472147, + "grad_norm": 2.71875, + "learning_rate": 0.014335535522628043, + "loss": 2.929, + "mean_token_accuracy": 0.41000568866729736, + "num_tokens": 3760671982.0, + "step": 7356 + }, + { + "epoch": 1.9894537587885344, + "grad_norm": 2.828125, + "learning_rate": 0.01433407124947442, + "loss": 2.9468, + "mean_token_accuracy": 0.41140735149383545, + "num_tokens": 3761196240.0, + "step": 7357 + }, + { + "epoch": 1.989724175229854, + "grad_norm": 3.734375, + "learning_rate": 0.01433260687402169, + "loss": 3.2408, + "mean_token_accuracy": 0.3892973065376282, + "num_tokens": 3761720482.0, + "step": 7358 + }, + { + "epoch": 1.9899945916711737, + "grad_norm": 3.078125, + "learning_rate": 0.01433114239631478, + "loss": 3.2025, + "mean_token_accuracy": 0.39592909812927246, + "num_tokens": 3762216978.0, + "step": 7359 + }, + { + "epoch": 1.9902650081124933, + "grad_norm": 3.53125, + "learning_rate": 0.014329677816398625, + "loss": 3.2473, + "mean_token_accuracy": 0.39456886053085327, + "num_tokens": 3762741042.0, + "step": 7360 + }, + { + "epoch": 1.990535424553813, + "grad_norm": 2.625, + "learning_rate": 0.014328213134318162, + "loss": 3.3539, + "mean_token_accuracy": 0.3849338889122009, + "num_tokens": 3763265233.0, + "step": 7361 + }, + { + "epoch": 1.9908058409951326, + "grad_norm": 3.375, + "learning_rate": 0.01432674835011833, + "loss": 3.1263, + "mean_token_accuracy": 0.39950141310691833, + "num_tokens": 3763789474.0, + "step": 7362 + }, + { + "epoch": 1.9910762574364522, + "grad_norm": 3.0625, + "learning_rate": 0.014325283463844079, + "loss": 3.1368, + "mean_token_accuracy": 0.40491268038749695, + "num_tokens": 3764313648.0, + "step": 7363 + }, + { + "epoch": 1.9913466738777719, + "grad_norm": 2.828125, + "learning_rate": 0.01432381847554035, + "loss": 3.2418, + "mean_token_accuracy": 0.3959760069847107, + "num_tokens": 3764745305.0, + "step": 7364 + }, + { + "epoch": 1.9916170903190915, + "grad_norm": 2.078125, + "learning_rate": 0.014322353385252095, + "loss": 3.0497, + "mean_token_accuracy": 0.4179704785346985, + "num_tokens": 3765269489.0, + "step": 7365 + }, + { + "epoch": 1.9918875067604112, + "grad_norm": 2.640625, + "learning_rate": 0.014320888193024272, + "loss": 3.147, + "mean_token_accuracy": 0.40646880865097046, + "num_tokens": 3765793583.0, + "step": 7366 + }, + { + "epoch": 1.9921579232017308, + "grad_norm": 2.65625, + "learning_rate": 0.01431942289890183, + "loss": 3.215, + "mean_token_accuracy": 0.3953930735588074, + "num_tokens": 3766317830.0, + "step": 7367 + }, + { + "epoch": 1.9924283396430504, + "grad_norm": 2.359375, + "learning_rate": 0.014317957502929732, + "loss": 2.7588, + "mean_token_accuracy": 0.422358900308609, + "num_tokens": 3766842068.0, + "step": 7368 + }, + { + "epoch": 1.99269875608437, + "grad_norm": 2.625, + "learning_rate": 0.01431649200515294, + "loss": 3.2041, + "mean_token_accuracy": 0.41338929533958435, + "num_tokens": 3767302552.0, + "step": 7369 + }, + { + "epoch": 1.9929691725256897, + "grad_norm": 2.75, + "learning_rate": 0.01431502640561642, + "loss": 3.0373, + "mean_token_accuracy": 0.4083116054534912, + "num_tokens": 3767826673.0, + "step": 7370 + }, + { + "epoch": 1.9932395889670091, + "grad_norm": 115.5, + "learning_rate": 0.01431356070436514, + "loss": 13.1132, + "mean_token_accuracy": 0.015528419986367226, + "num_tokens": 3768307125.0, + "step": 7371 + }, + { + "epoch": 1.9935100054083288, + "grad_norm": 7.53125, + "learning_rate": 0.014312094901444075, + "loss": 3.4898, + "mean_token_accuracy": 0.38514670729637146, + "num_tokens": 3768831281.0, + "step": 7372 + }, + { + "epoch": 1.9937804218496484, + "grad_norm": 2.4375, + "learning_rate": 0.014310628996898198, + "loss": 3.3777, + "mean_token_accuracy": 0.3742295205593109, + "num_tokens": 3769355354.0, + "step": 7373 + }, + { + "epoch": 1.994050838290968, + "grad_norm": 3.234375, + "learning_rate": 0.014309162990772487, + "loss": 3.2246, + "mean_token_accuracy": 0.37476444244384766, + "num_tokens": 3769879633.0, + "step": 7374 + }, + { + "epoch": 1.9943212547322877, + "grad_norm": 2.828125, + "learning_rate": 0.014307696883111922, + "loss": 3.4173, + "mean_token_accuracy": 0.39134663343429565, + "num_tokens": 3770403880.0, + "step": 7375 + }, + { + "epoch": 1.9945916711736074, + "grad_norm": 3.09375, + "learning_rate": 0.01430623067396149, + "loss": 3.47, + "mean_token_accuracy": 0.3509160578250885, + "num_tokens": 3770928045.0, + "step": 7376 + }, + { + "epoch": 1.994862087614927, + "grad_norm": 2.640625, + "learning_rate": 0.01430476436336618, + "loss": 3.2624, + "mean_token_accuracy": 0.40465590357780457, + "num_tokens": 3771429239.0, + "step": 7377 + }, + { + "epoch": 1.9951325040562466, + "grad_norm": 3.203125, + "learning_rate": 0.01430329795137098, + "loss": 3.268, + "mean_token_accuracy": 0.3804241418838501, + "num_tokens": 3771953346.0, + "step": 7378 + }, + { + "epoch": 1.9954029204975663, + "grad_norm": 2.875, + "learning_rate": 0.014301831438020886, + "loss": 3.1808, + "mean_token_accuracy": 0.3909521698951721, + "num_tokens": 3772477518.0, + "step": 7379 + }, + { + "epoch": 1.995673336938886, + "grad_norm": 3.171875, + "learning_rate": 0.014300364823360891, + "loss": 3.127, + "mean_token_accuracy": 0.40064454078674316, + "num_tokens": 3773001790.0, + "step": 7380 + }, + { + "epoch": 1.9959437533802054, + "grad_norm": 2.796875, + "learning_rate": 0.014298898107436, + "loss": 3.1076, + "mean_token_accuracy": 0.410237580537796, + "num_tokens": 3773471386.0, + "step": 7381 + }, + { + "epoch": 1.996214169821525, + "grad_norm": 3.5625, + "learning_rate": 0.014297431290291209, + "loss": 3.0825, + "mean_token_accuracy": 0.3995654881000519, + "num_tokens": 3773920200.0, + "step": 7382 + }, + { + "epoch": 1.9964845862628446, + "grad_norm": 3.078125, + "learning_rate": 0.014295964371971533, + "loss": 3.1776, + "mean_token_accuracy": 0.3891933262348175, + "num_tokens": 3774444442.0, + "step": 7383 + }, + { + "epoch": 1.9967550027041643, + "grad_norm": 3.828125, + "learning_rate": 0.014294497352521975, + "loss": 3.2579, + "mean_token_accuracy": 0.3909934163093567, + "num_tokens": 3774964189.0, + "step": 7384 + }, + { + "epoch": 1.997025419145484, + "grad_norm": 8.6875, + "learning_rate": 0.014293030231987549, + "loss": 2.9955, + "mean_token_accuracy": 0.46080541610717773, + "num_tokens": 3775424309.0, + "step": 7385 + }, + { + "epoch": 1.9972958355868036, + "grad_norm": 37.25, + "learning_rate": 0.014291563010413275, + "loss": 3.7137, + "mean_token_accuracy": 0.3480307459831238, + "num_tokens": 3775948494.0, + "step": 7386 + }, + { + "epoch": 1.9975662520281232, + "grad_norm": 2.8125, + "learning_rate": 0.014290095687844166, + "loss": 3.2862, + "mean_token_accuracy": 0.3618844747543335, + "num_tokens": 3776472678.0, + "step": 7387 + }, + { + "epoch": 1.9978366684694429, + "grad_norm": 7.96875, + "learning_rate": 0.014288628264325245, + "loss": 3.0913, + "mean_token_accuracy": 0.39082974195480347, + "num_tokens": 3776962525.0, + "step": 7388 + }, + { + "epoch": 1.9981070849107625, + "grad_norm": 2.234375, + "learning_rate": 0.014287160739901541, + "loss": 3.2415, + "mean_token_accuracy": 0.39020872116088867, + "num_tokens": 3777473048.0, + "step": 7389 + }, + { + "epoch": 1.9983775013520821, + "grad_norm": 3.125, + "learning_rate": 0.014285693114618074, + "loss": 3.3214, + "mean_token_accuracy": 0.3743990361690521, + "num_tokens": 3777997265.0, + "step": 7390 + }, + { + "epoch": 1.9986479177934018, + "grad_norm": 32.5, + "learning_rate": 0.01428422538851988, + "loss": 11.9835, + "mean_token_accuracy": 0.0009595418232493103, + "num_tokens": 3778406438.0, + "step": 7391 + }, + { + "epoch": 1.9989183342347214, + "grad_norm": 10.75, + "learning_rate": 0.014282757561651996, + "loss": 3.9559, + "mean_token_accuracy": 0.29259932041168213, + "num_tokens": 3778930661.0, + "step": 7392 + }, + { + "epoch": 1.999188750676041, + "grad_norm": 2.75, + "learning_rate": 0.014281289634059455, + "loss": 3.4656, + "mean_token_accuracy": 0.4064129590988159, + "num_tokens": 3779389501.0, + "step": 7393 + }, + { + "epoch": 1.9994591671173607, + "grad_norm": 2.171875, + "learning_rate": 0.0142798216057873, + "loss": 3.3679, + "mean_token_accuracy": 0.39188700914382935, + "num_tokens": 3779862663.0, + "step": 7394 + }, + { + "epoch": 1.9997295835586804, + "grad_norm": 2.796875, + "learning_rate": 0.014278353476880569, + "loss": 3.2418, + "mean_token_accuracy": 0.38526809215545654, + "num_tokens": 3780386814.0, + "step": 7395 + }, + { + "epoch": 2.0, + "grad_norm": 2.90625, + "learning_rate": 0.014276885247384317, + "loss": 3.3425, + "mean_token_accuracy": 0.3956184387207031, + "num_tokens": 3780648958.0, + "step": 7396 + }, + { + "epoch": 2.0002704164413196, + "grad_norm": 3.15625, + "learning_rate": 0.014275416917343582, + "loss": 3.3994, + "mean_token_accuracy": 0.3737051486968994, + "num_tokens": 3781130527.0, + "step": 7397 + }, + { + "epoch": 2.0005408328826393, + "grad_norm": 2.0625, + "learning_rate": 0.014273948486803427, + "loss": 3.0694, + "mean_token_accuracy": 0.3971785306930542, + "num_tokens": 3781654794.0, + "step": 7398 + }, + { + "epoch": 2.000811249323959, + "grad_norm": 3.15625, + "learning_rate": 0.014272479955808905, + "loss": 3.4375, + "mean_token_accuracy": 0.37231147289276123, + "num_tokens": 3782170966.0, + "step": 7399 + }, + { + "epoch": 2.0010816657652786, + "grad_norm": 2.265625, + "learning_rate": 0.014271011324405076, + "loss": 3.2243, + "mean_token_accuracy": 0.4033588171005249, + "num_tokens": 3782650837.0, + "step": 7400 + }, + { + "epoch": 2.001352082206598, + "grad_norm": 3.0, + "learning_rate": 0.014269542592636996, + "loss": 3.2701, + "mean_token_accuracy": 0.3813706636428833, + "num_tokens": 3783175120.0, + "step": 7401 + }, + { + "epoch": 2.001622498647918, + "grad_norm": 2.390625, + "learning_rate": 0.014268073760549739, + "loss": 3.245, + "mean_token_accuracy": 0.3867069184780121, + "num_tokens": 3783699396.0, + "step": 7402 + }, + { + "epoch": 2.0018929150892375, + "grad_norm": 2.5625, + "learning_rate": 0.014266604828188366, + "loss": 3.1293, + "mean_token_accuracy": 0.3950711786746979, + "num_tokens": 3784223561.0, + "step": 7403 + }, + { + "epoch": 2.002163331530557, + "grad_norm": 2.125, + "learning_rate": 0.01426513579559795, + "loss": 3.1084, + "mean_token_accuracy": 0.4138886332511902, + "num_tokens": 3784688430.0, + "step": 7404 + }, + { + "epoch": 2.002433747971877, + "grad_norm": 2.875, + "learning_rate": 0.014263666662823563, + "loss": 3.1648, + "mean_token_accuracy": 0.42493993043899536, + "num_tokens": 3785152738.0, + "step": 7405 + }, + { + "epoch": 2.0027041644131964, + "grad_norm": 2.625, + "learning_rate": 0.014262197429910289, + "loss": 3.0695, + "mean_token_accuracy": 0.4225049316883087, + "num_tokens": 3785629831.0, + "step": 7406 + }, + { + "epoch": 2.002974580854516, + "grad_norm": 2.875, + "learning_rate": 0.014260728096903205, + "loss": 3.1113, + "mean_token_accuracy": 0.4050661325454712, + "num_tokens": 3786154045.0, + "step": 7407 + }, + { + "epoch": 2.0032449972958357, + "grad_norm": 3.0, + "learning_rate": 0.01425925866384739, + "loss": 3.1469, + "mean_token_accuracy": 0.3722318410873413, + "num_tokens": 3786678260.0, + "step": 7408 + }, + { + "epoch": 2.0035154137371554, + "grad_norm": 3.4375, + "learning_rate": 0.014257789130787938, + "loss": 3.4784, + "mean_token_accuracy": 0.3512142300605774, + "num_tokens": 3787202415.0, + "step": 7409 + }, + { + "epoch": 2.003785830178475, + "grad_norm": 5.3125, + "learning_rate": 0.014256319497769931, + "loss": 3.265, + "mean_token_accuracy": 0.3546730875968933, + "num_tokens": 3787726663.0, + "step": 7410 + }, + { + "epoch": 2.0040562466197946, + "grad_norm": 45.5, + "learning_rate": 0.014254849764838469, + "loss": 30.1357, + "mean_token_accuracy": 0.02058529481291771, + "num_tokens": 3788250826.0, + "step": 7411 + }, + { + "epoch": 2.0043266630611143, + "grad_norm": 3.859375, + "learning_rate": 0.01425337993203864, + "loss": 3.455, + "mean_token_accuracy": 0.3517741560935974, + "num_tokens": 3788775044.0, + "step": 7412 + }, + { + "epoch": 2.004597079502434, + "grad_norm": 2.28125, + "learning_rate": 0.014251909999415554, + "loss": 3.2785, + "mean_token_accuracy": 0.35855191946029663, + "num_tokens": 3789274558.0, + "step": 7413 + }, + { + "epoch": 2.0048674959437536, + "grad_norm": 2.96875, + "learning_rate": 0.014250439967014299, + "loss": 3.1448, + "mean_token_accuracy": 0.3921315670013428, + "num_tokens": 3789798824.0, + "step": 7414 + }, + { + "epoch": 2.005137912385073, + "grad_norm": 2.84375, + "learning_rate": 0.01424896983487999, + "loss": 3.3722, + "mean_token_accuracy": 0.3938712477684021, + "num_tokens": 3790323018.0, + "step": 7415 + }, + { + "epoch": 2.005408328826393, + "grad_norm": 3.28125, + "learning_rate": 0.014247499603057734, + "loss": 3.4077, + "mean_token_accuracy": 0.3949596881866455, + "num_tokens": 3790809576.0, + "step": 7416 + }, + { + "epoch": 2.005678745267712, + "grad_norm": 2.734375, + "learning_rate": 0.014246029271592636, + "loss": 3.2426, + "mean_token_accuracy": 0.37340980768203735, + "num_tokens": 3791333853.0, + "step": 7417 + }, + { + "epoch": 2.0059491617090317, + "grad_norm": 2.46875, + "learning_rate": 0.014244558840529817, + "loss": 3.315, + "mean_token_accuracy": 0.3967092037200928, + "num_tokens": 3791858139.0, + "step": 7418 + }, + { + "epoch": 2.0062195781503513, + "grad_norm": 3.109375, + "learning_rate": 0.014243088309914387, + "loss": 3.1935, + "mean_token_accuracy": 0.3605040907859802, + "num_tokens": 3792382364.0, + "step": 7419 + }, + { + "epoch": 2.006489994591671, + "grad_norm": 2.984375, + "learning_rate": 0.014241617679791475, + "loss": 3.3208, + "mean_token_accuracy": 0.3809261620044708, + "num_tokens": 3792889648.0, + "step": 7420 + }, + { + "epoch": 2.0067604110329906, + "grad_norm": 2.875, + "learning_rate": 0.014240146950206199, + "loss": 2.9489, + "mean_token_accuracy": 0.4004846215248108, + "num_tokens": 3793413785.0, + "step": 7421 + }, + { + "epoch": 2.0070308274743103, + "grad_norm": 2.359375, + "learning_rate": 0.014238676121203685, + "loss": 3.2603, + "mean_token_accuracy": 0.40071752667427063, + "num_tokens": 3793930390.0, + "step": 7422 + }, + { + "epoch": 2.00730124391563, + "grad_norm": 3.796875, + "learning_rate": 0.014237205192829063, + "loss": 2.7644, + "mean_token_accuracy": 0.41637516021728516, + "num_tokens": 3794358753.0, + "step": 7423 + }, + { + "epoch": 2.0075716603569496, + "grad_norm": 2.84375, + "learning_rate": 0.014235734165127465, + "loss": 3.243, + "mean_token_accuracy": 0.40489661693573, + "num_tokens": 3794829488.0, + "step": 7424 + }, + { + "epoch": 2.007842076798269, + "grad_norm": 3.734375, + "learning_rate": 0.014234263038144028, + "loss": 3.2065, + "mean_token_accuracy": 0.40129554271698, + "num_tokens": 3795353594.0, + "step": 7425 + }, + { + "epoch": 2.008112493239589, + "grad_norm": 2.75, + "learning_rate": 0.014232791811923889, + "loss": 3.3011, + "mean_token_accuracy": 0.39216238260269165, + "num_tokens": 3795877829.0, + "step": 7426 + }, + { + "epoch": 2.0083829096809085, + "grad_norm": 3.125, + "learning_rate": 0.014231320486512192, + "loss": 3.1811, + "mean_token_accuracy": 0.39660125970840454, + "num_tokens": 3796401985.0, + "step": 7427 + }, + { + "epoch": 2.008653326122228, + "grad_norm": 2.265625, + "learning_rate": 0.014229849061954077, + "loss": 3.157, + "mean_token_accuracy": 0.413421630859375, + "num_tokens": 3796865501.0, + "step": 7428 + }, + { + "epoch": 2.0089237425635478, + "grad_norm": 3.421875, + "learning_rate": 0.014228377538294697, + "loss": 3.3905, + "mean_token_accuracy": 0.3933693468570709, + "num_tokens": 3797389735.0, + "step": 7429 + }, + { + "epoch": 2.0091941590048674, + "grad_norm": 2.78125, + "learning_rate": 0.0142269059155792, + "loss": 3.1607, + "mean_token_accuracy": 0.40933114290237427, + "num_tokens": 3797913870.0, + "step": 7430 + }, + { + "epoch": 2.009464575446187, + "grad_norm": 12.4375, + "learning_rate": 0.014225434193852738, + "loss": 10.325, + "mean_token_accuracy": 0.00775360781699419, + "num_tokens": 3798406137.0, + "step": 7431 + }, + { + "epoch": 2.0097349918875067, + "grad_norm": 8.5, + "learning_rate": 0.014223962373160471, + "loss": 3.6765, + "mean_token_accuracy": 0.3282731771469116, + "num_tokens": 3798930385.0, + "step": 7432 + }, + { + "epoch": 2.0100054083288263, + "grad_norm": 2.640625, + "learning_rate": 0.014222490453547556, + "loss": 3.0573, + "mean_token_accuracy": 0.40639370679855347, + "num_tokens": 3799414708.0, + "step": 7433 + }, + { + "epoch": 2.010275824770146, + "grad_norm": 3.125, + "learning_rate": 0.014221018435059157, + "loss": 3.0199, + "mean_token_accuracy": 0.42594462633132935, + "num_tokens": 3799938839.0, + "step": 7434 + }, + { + "epoch": 2.0105462412114656, + "grad_norm": 2.0, + "learning_rate": 0.014219546317740441, + "loss": 2.9122, + "mean_token_accuracy": 0.41203105449676514, + "num_tokens": 3800462923.0, + "step": 7435 + }, + { + "epoch": 2.0108166576527853, + "grad_norm": 2.59375, + "learning_rate": 0.014218074101636575, + "loss": 3.248, + "mean_token_accuracy": 0.3948274254798889, + "num_tokens": 3800987047.0, + "step": 7436 + }, + { + "epoch": 2.011087074094105, + "grad_norm": 2.515625, + "learning_rate": 0.014216601786792734, + "loss": 3.2359, + "mean_token_accuracy": 0.38835152983665466, + "num_tokens": 3801511260.0, + "step": 7437 + }, + { + "epoch": 2.0113574905354246, + "grad_norm": 3.015625, + "learning_rate": 0.014215129373254089, + "loss": 3.1048, + "mean_token_accuracy": 0.3904969096183777, + "num_tokens": 3802035375.0, + "step": 7438 + }, + { + "epoch": 2.011627906976744, + "grad_norm": 3.546875, + "learning_rate": 0.014213656861065823, + "loss": 3.3442, + "mean_token_accuracy": 0.37304604053497314, + "num_tokens": 3802538846.0, + "step": 7439 + }, + { + "epoch": 2.011898323418064, + "grad_norm": 2.765625, + "learning_rate": 0.014212184250273105, + "loss": 3.2362, + "mean_token_accuracy": 0.3897646963596344, + "num_tokens": 3803024615.0, + "step": 7440 + }, + { + "epoch": 2.0121687398593835, + "grad_norm": 2.484375, + "learning_rate": 0.014210711540921135, + "loss": 2.9236, + "mean_token_accuracy": 0.41352829337120056, + "num_tokens": 3803548803.0, + "step": 7441 + }, + { + "epoch": 2.012439156300703, + "grad_norm": 3.59375, + "learning_rate": 0.014209238733055091, + "loss": 3.2579, + "mean_token_accuracy": 0.3903808295726776, + "num_tokens": 3804072936.0, + "step": 7442 + }, + { + "epoch": 2.012709572742023, + "grad_norm": 3.71875, + "learning_rate": 0.014207765826720167, + "loss": 3.1446, + "mean_token_accuracy": 0.41387835144996643, + "num_tokens": 3804597190.0, + "step": 7443 + }, + { + "epoch": 2.0129799891833424, + "grad_norm": 2.90625, + "learning_rate": 0.014206292821961553, + "loss": 3.1835, + "mean_token_accuracy": 0.38515493273735046, + "num_tokens": 3805075668.0, + "step": 7444 + }, + { + "epoch": 2.013250405624662, + "grad_norm": 2.546875, + "learning_rate": 0.014204819718824447, + "loss": 3.2831, + "mean_token_accuracy": 0.3788246512413025, + "num_tokens": 3805599874.0, + "step": 7445 + }, + { + "epoch": 2.0135208220659817, + "grad_norm": 3.25, + "learning_rate": 0.014203346517354045, + "loss": 3.1771, + "mean_token_accuracy": 0.35995227098464966, + "num_tokens": 3806076643.0, + "step": 7446 + }, + { + "epoch": 2.0137912385073014, + "grad_norm": 2.796875, + "learning_rate": 0.014201873217595551, + "loss": 3.2053, + "mean_token_accuracy": 0.38285672664642334, + "num_tokens": 3806587930.0, + "step": 7447 + }, + { + "epoch": 2.014061654948621, + "grad_norm": 3.265625, + "learning_rate": 0.014200399819594174, + "loss": 3.2925, + "mean_token_accuracy": 0.37985318899154663, + "num_tokens": 3807112139.0, + "step": 7448 + }, + { + "epoch": 2.0143320713899406, + "grad_norm": 3.25, + "learning_rate": 0.014198926323395117, + "loss": 3.3422, + "mean_token_accuracy": 0.4021511673927307, + "num_tokens": 3807636368.0, + "step": 7449 + }, + { + "epoch": 2.0146024878312603, + "grad_norm": 3.5625, + "learning_rate": 0.014197452729043592, + "loss": 3.1408, + "mean_token_accuracy": 0.4074442982673645, + "num_tokens": 3808150757.0, + "step": 7450 + }, + { + "epoch": 2.01487290427258, + "grad_norm": 116.5, + "learning_rate": 0.014195979036584812, + "loss": 10.463, + "mean_token_accuracy": 0.01989123970270157, + "num_tokens": 3808674679.0, + "step": 7451 + }, + { + "epoch": 2.0151433207138996, + "grad_norm": 6.84375, + "learning_rate": 0.014194505246064, + "loss": 3.5206, + "mean_token_accuracy": 0.3368282914161682, + "num_tokens": 3809198907.0, + "step": 7452 + }, + { + "epoch": 2.015413737155219, + "grad_norm": 2.296875, + "learning_rate": 0.014193031357526371, + "loss": 3.259, + "mean_token_accuracy": 0.4232858419418335, + "num_tokens": 3809579739.0, + "step": 7453 + }, + { + "epoch": 2.015684153596539, + "grad_norm": 2.390625, + "learning_rate": 0.014191557371017152, + "loss": 3.1355, + "mean_token_accuracy": 0.41438618302345276, + "num_tokens": 3810044563.0, + "step": 7454 + }, + { + "epoch": 2.0159545700378585, + "grad_norm": 3.578125, + "learning_rate": 0.014190083286581565, + "loss": 3.07, + "mean_token_accuracy": 0.3996155261993408, + "num_tokens": 3810568804.0, + "step": 7455 + }, + { + "epoch": 2.016224986479178, + "grad_norm": 2.390625, + "learning_rate": 0.014188609104264843, + "loss": 3.0434, + "mean_token_accuracy": 0.41371870040893555, + "num_tokens": 3811093024.0, + "step": 7456 + }, + { + "epoch": 2.016495402920498, + "grad_norm": 3.125, + "learning_rate": 0.014187134824112213, + "loss": 3.2553, + "mean_token_accuracy": 0.38921552896499634, + "num_tokens": 3811617254.0, + "step": 7457 + }, + { + "epoch": 2.016765819361817, + "grad_norm": 3.109375, + "learning_rate": 0.01418566044616892, + "loss": 3.3744, + "mean_token_accuracy": 0.3860359191894531, + "num_tokens": 3812141395.0, + "step": 7458 + }, + { + "epoch": 2.0170362358031366, + "grad_norm": 2.84375, + "learning_rate": 0.014184185970480193, + "loss": 3.0783, + "mean_token_accuracy": 0.4042223393917084, + "num_tokens": 3812665600.0, + "step": 7459 + }, + { + "epoch": 2.0173066522444563, + "grad_norm": 2.59375, + "learning_rate": 0.014182711397091275, + "loss": 3.1443, + "mean_token_accuracy": 0.38927602767944336, + "num_tokens": 3813189646.0, + "step": 7460 + }, + { + "epoch": 2.017577068685776, + "grad_norm": 2.203125, + "learning_rate": 0.014181236726047413, + "loss": 3.1701, + "mean_token_accuracy": 0.40103596448898315, + "num_tokens": 3813713798.0, + "step": 7461 + }, + { + "epoch": 2.0178474851270956, + "grad_norm": 2.75, + "learning_rate": 0.01417976195739385, + "loss": 3.2329, + "mean_token_accuracy": 0.3963615894317627, + "num_tokens": 3814238078.0, + "step": 7462 + }, + { + "epoch": 2.018117901568415, + "grad_norm": 2.15625, + "learning_rate": 0.01417828709117584, + "loss": 3.2325, + "mean_token_accuracy": 0.3984634578227997, + "num_tokens": 3814762275.0, + "step": 7463 + }, + { + "epoch": 2.018388318009735, + "grad_norm": 2.921875, + "learning_rate": 0.014176812127438638, + "loss": 3.2359, + "mean_token_accuracy": 0.38152843713760376, + "num_tokens": 3815286508.0, + "step": 7464 + }, + { + "epoch": 2.0186587344510545, + "grad_norm": 2.71875, + "learning_rate": 0.014175337066227499, + "loss": 3.0508, + "mean_token_accuracy": 0.40751802921295166, + "num_tokens": 3815797705.0, + "step": 7465 + }, + { + "epoch": 2.018929150892374, + "grad_norm": 3.25, + "learning_rate": 0.014173861907587678, + "loss": 3.1264, + "mean_token_accuracy": 0.4116600751876831, + "num_tokens": 3816214135.0, + "step": 7466 + }, + { + "epoch": 2.0191995673336938, + "grad_norm": 2.59375, + "learning_rate": 0.01417238665156444, + "loss": 3.1489, + "mean_token_accuracy": 0.40520113706588745, + "num_tokens": 3816738221.0, + "step": 7467 + }, + { + "epoch": 2.0194699837750134, + "grad_norm": 2.71875, + "learning_rate": 0.014170911298203047, + "loss": 3.1497, + "mean_token_accuracy": 0.3835408091545105, + "num_tokens": 3817262337.0, + "step": 7468 + }, + { + "epoch": 2.019740400216333, + "grad_norm": 2.65625, + "learning_rate": 0.014169435847548774, + "loss": 3.1692, + "mean_token_accuracy": 0.4040158987045288, + "num_tokens": 3817786548.0, + "step": 7469 + }, + { + "epoch": 2.0200108166576527, + "grad_norm": 2.921875, + "learning_rate": 0.014167960299646887, + "loss": 2.8688, + "mean_token_accuracy": 0.42208728194236755, + "num_tokens": 3818310777.0, + "step": 7470 + }, + { + "epoch": 2.0202812330989723, + "grad_norm": 93.0, + "learning_rate": 0.014166484654542663, + "loss": 13.5049, + "mean_token_accuracy": 0.004353655967861414, + "num_tokens": 3818771655.0, + "step": 7471 + }, + { + "epoch": 2.020551649540292, + "grad_norm": 5.5625, + "learning_rate": 0.01416500891228138, + "loss": 3.3992, + "mean_token_accuracy": 0.38664358854293823, + "num_tokens": 3819295857.0, + "step": 7472 + }, + { + "epoch": 2.0208220659816116, + "grad_norm": 2.375, + "learning_rate": 0.014163533072908312, + "loss": 3.3834, + "mean_token_accuracy": 0.37812182307243347, + "num_tokens": 3819789928.0, + "step": 7473 + }, + { + "epoch": 2.0210924824229313, + "grad_norm": 2.265625, + "learning_rate": 0.014162057136468748, + "loss": 3.0491, + "mean_token_accuracy": 0.3914716839790344, + "num_tokens": 3820314096.0, + "step": 7474 + }, + { + "epoch": 2.021362898864251, + "grad_norm": 2.59375, + "learning_rate": 0.014160581103007972, + "loss": 3.0514, + "mean_token_accuracy": 0.41741517186164856, + "num_tokens": 3820735758.0, + "step": 7475 + }, + { + "epoch": 2.0216333153055706, + "grad_norm": 3.140625, + "learning_rate": 0.01415910497257127, + "loss": 3.4001, + "mean_token_accuracy": 0.391557902097702, + "num_tokens": 3821208612.0, + "step": 7476 + }, + { + "epoch": 2.02190373174689, + "grad_norm": 3.71875, + "learning_rate": 0.01415762874520394, + "loss": 3.106, + "mean_token_accuracy": 0.4031822085380554, + "num_tokens": 3821732726.0, + "step": 7477 + }, + { + "epoch": 2.02217414818821, + "grad_norm": 2.5625, + "learning_rate": 0.014156152420951274, + "loss": 2.9883, + "mean_token_accuracy": 0.40592074394226074, + "num_tokens": 3822256955.0, + "step": 7478 + }, + { + "epoch": 2.0224445646295295, + "grad_norm": 2.828125, + "learning_rate": 0.014154675999858571, + "loss": 3.2131, + "mean_token_accuracy": 0.3916718363761902, + "num_tokens": 3822767504.0, + "step": 7479 + }, + { + "epoch": 2.022714981070849, + "grad_norm": 2.59375, + "learning_rate": 0.014153199481971128, + "loss": 2.8957, + "mean_token_accuracy": 0.4256856143474579, + "num_tokens": 3823291674.0, + "step": 7480 + }, + { + "epoch": 2.0229853975121688, + "grad_norm": 2.890625, + "learning_rate": 0.014151722867334255, + "loss": 3.22, + "mean_token_accuracy": 0.40738770365715027, + "num_tokens": 3823815877.0, + "step": 7481 + }, + { + "epoch": 2.0232558139534884, + "grad_norm": 3.796875, + "learning_rate": 0.014150246155993253, + "loss": 3.2508, + "mean_token_accuracy": 0.40553146600723267, + "num_tokens": 3824291714.0, + "step": 7482 + }, + { + "epoch": 2.023526230394808, + "grad_norm": 3.6875, + "learning_rate": 0.014148769347993435, + "loss": 3.1064, + "mean_token_accuracy": 0.36855486035346985, + "num_tokens": 3824815991.0, + "step": 7483 + }, + { + "epoch": 2.0237966468361277, + "grad_norm": 1.9375, + "learning_rate": 0.014147292443380116, + "loss": 3.3361, + "mean_token_accuracy": 0.3946921229362488, + "num_tokens": 3825340221.0, + "step": 7484 + }, + { + "epoch": 2.0240670632774473, + "grad_norm": 3.515625, + "learning_rate": 0.014145815442198608, + "loss": 3.3941, + "mean_token_accuracy": 0.3737742006778717, + "num_tokens": 3825864318.0, + "step": 7485 + }, + { + "epoch": 2.024337479718767, + "grad_norm": 2.921875, + "learning_rate": 0.014144338344494231, + "loss": 3.1913, + "mean_token_accuracy": 0.3949851393699646, + "num_tokens": 3826388556.0, + "step": 7486 + }, + { + "epoch": 2.0246078961600866, + "grad_norm": 2.625, + "learning_rate": 0.014142861150312306, + "loss": 3.1099, + "mean_token_accuracy": 0.40460366010665894, + "num_tokens": 3826895969.0, + "step": 7487 + }, + { + "epoch": 2.0248783126014063, + "grad_norm": 2.9375, + "learning_rate": 0.014141383859698162, + "loss": 3.07, + "mean_token_accuracy": 0.3970376253128052, + "num_tokens": 3827420245.0, + "step": 7488 + }, + { + "epoch": 2.025148729042726, + "grad_norm": 3.03125, + "learning_rate": 0.01413990647269712, + "loss": 3.2135, + "mean_token_accuracy": 0.3927795886993408, + "num_tokens": 3827900099.0, + "step": 7489 + }, + { + "epoch": 2.0254191454840456, + "grad_norm": 2.8125, + "learning_rate": 0.01413842898935451, + "loss": 3.3327, + "mean_token_accuracy": 0.3953189253807068, + "num_tokens": 3828424203.0, + "step": 7490 + }, + { + "epoch": 2.025689561925365, + "grad_norm": 266.0, + "learning_rate": 0.014136951409715673, + "loss": 24.3231, + "mean_token_accuracy": 5.580690049100667e-05, + "num_tokens": 3828948461.0, + "step": 7491 + }, + { + "epoch": 2.025959978366685, + "grad_norm": 5.71875, + "learning_rate": 0.014135473733825944, + "loss": 3.6657, + "mean_token_accuracy": 0.3276084065437317, + "num_tokens": 3829438459.0, + "step": 7492 + }, + { + "epoch": 2.0262303948080045, + "grad_norm": 2.109375, + "learning_rate": 0.014133995961730652, + "loss": 3.3623, + "mean_token_accuracy": 0.38911890983581543, + "num_tokens": 3829962623.0, + "step": 7493 + }, + { + "epoch": 2.026500811249324, + "grad_norm": 3.5625, + "learning_rate": 0.014132518093475157, + "loss": 3.4407, + "mean_token_accuracy": 0.37986788153648376, + "num_tokens": 3830426338.0, + "step": 7494 + }, + { + "epoch": 2.0267712276906438, + "grad_norm": 3.375, + "learning_rate": 0.014131040129104789, + "loss": 3.3095, + "mean_token_accuracy": 0.39174655079841614, + "num_tokens": 3830950618.0, + "step": 7495 + }, + { + "epoch": 2.0270416441319634, + "grad_norm": 3.453125, + "learning_rate": 0.014129562068664904, + "loss": 3.3595, + "mean_token_accuracy": 0.3759790062904358, + "num_tokens": 3831474846.0, + "step": 7496 + }, + { + "epoch": 2.027312060573283, + "grad_norm": 3.265625, + "learning_rate": 0.014128083912200851, + "loss": 3.3565, + "mean_token_accuracy": 0.370751291513443, + "num_tokens": 3831999077.0, + "step": 7497 + }, + { + "epoch": 2.0275824770146027, + "grad_norm": 2.75, + "learning_rate": 0.014126605659757983, + "loss": 3.2518, + "mean_token_accuracy": 0.37164461612701416, + "num_tokens": 3832523163.0, + "step": 7498 + }, + { + "epoch": 2.027852893455922, + "grad_norm": 2.6875, + "learning_rate": 0.014125127311381658, + "loss": 3.0526, + "mean_token_accuracy": 0.4019334316253662, + "num_tokens": 3833047148.0, + "step": 7499 + }, + { + "epoch": 2.0281233098972415, + "grad_norm": 3.296875, + "learning_rate": 0.014123648867117238, + "loss": 3.1394, + "mean_token_accuracy": 0.3760625123977661, + "num_tokens": 3833571403.0, + "step": 7500 + }, + { + "epoch": 2.028393726338561, + "grad_norm": 2.859375, + "learning_rate": 0.014122170327010087, + "loss": 3.0045, + "mean_token_accuracy": 0.39762741327285767, + "num_tokens": 3834038183.0, + "step": 7501 + }, + { + "epoch": 2.028664142779881, + "grad_norm": 2.75, + "learning_rate": 0.01412069169110557, + "loss": 3.2367, + "mean_token_accuracy": 0.3890809118747711, + "num_tokens": 3834562329.0, + "step": 7502 + }, + { + "epoch": 2.0289345592212005, + "grad_norm": 3.03125, + "learning_rate": 0.014119212959449049, + "loss": 3.1658, + "mean_token_accuracy": 0.38245731592178345, + "num_tokens": 3835078304.0, + "step": 7503 + }, + { + "epoch": 2.02920497566252, + "grad_norm": 2.40625, + "learning_rate": 0.014117734132085903, + "loss": 2.9918, + "mean_token_accuracy": 0.4157189428806305, + "num_tokens": 3835602544.0, + "step": 7504 + }, + { + "epoch": 2.0294753921038398, + "grad_norm": 2.609375, + "learning_rate": 0.014116255209061505, + "loss": 3.0125, + "mean_token_accuracy": 0.40668782591819763, + "num_tokens": 3836126709.0, + "step": 7505 + }, + { + "epoch": 2.0297458085451594, + "grad_norm": 2.375, + "learning_rate": 0.014114776190421235, + "loss": 3.2717, + "mean_token_accuracy": 0.3754955530166626, + "num_tokens": 3836650971.0, + "step": 7506 + }, + { + "epoch": 2.030016224986479, + "grad_norm": 3.140625, + "learning_rate": 0.014113297076210469, + "loss": 3.0932, + "mean_token_accuracy": 0.3843432068824768, + "num_tokens": 3837159029.0, + "step": 7507 + }, + { + "epoch": 2.0302866414277987, + "grad_norm": 2.578125, + "learning_rate": 0.014111817866474597, + "loss": 3.0396, + "mean_token_accuracy": 0.3884757161140442, + "num_tokens": 3837683277.0, + "step": 7508 + }, + { + "epoch": 2.0305570578691183, + "grad_norm": 3.109375, + "learning_rate": 0.014110338561259, + "loss": 3.0459, + "mean_token_accuracy": 0.39644527435302734, + "num_tokens": 3838207517.0, + "step": 7509 + }, + { + "epoch": 2.030827474310438, + "grad_norm": 2.328125, + "learning_rate": 0.014108859160609064, + "loss": 3.0292, + "mean_token_accuracy": 0.417389452457428, + "num_tokens": 3838673890.0, + "step": 7510 + }, + { + "epoch": 2.0310978907517576, + "grad_norm": 53.25, + "learning_rate": 0.014107379664570192, + "loss": 12.9192, + "mean_token_accuracy": 0.00394449383020401, + "num_tokens": 3839198095.0, + "step": 7511 + }, + { + "epoch": 2.0313683071930773, + "grad_norm": 7.84375, + "learning_rate": 0.014105900073187771, + "loss": 3.4335, + "mean_token_accuracy": 0.37882333993911743, + "num_tokens": 3839722327.0, + "step": 7512 + }, + { + "epoch": 2.031638723634397, + "grad_norm": 2.6875, + "learning_rate": 0.014104420386507202, + "loss": 3.4106, + "mean_token_accuracy": 0.40615206956863403, + "num_tokens": 3840203364.0, + "step": 7513 + }, + { + "epoch": 2.0319091400757165, + "grad_norm": 2.96875, + "learning_rate": 0.014102940604573889, + "loss": 3.4654, + "mean_token_accuracy": 0.36030691862106323, + "num_tokens": 3840727625.0, + "step": 7514 + }, + { + "epoch": 2.032179556517036, + "grad_norm": 3.0, + "learning_rate": 0.014101460727433233, + "loss": 3.3993, + "mean_token_accuracy": 0.4147482216358185, + "num_tokens": 3841149770.0, + "step": 7515 + }, + { + "epoch": 2.032449972958356, + "grad_norm": 2.78125, + "learning_rate": 0.014099980755130637, + "loss": 3.3925, + "mean_token_accuracy": 0.3799828290939331, + "num_tokens": 3841670442.0, + "step": 7516 + }, + { + "epoch": 2.0327203893996755, + "grad_norm": 3.265625, + "learning_rate": 0.01409850068771152, + "loss": 3.2821, + "mean_token_accuracy": 0.3935002386569977, + "num_tokens": 3842194585.0, + "step": 7517 + }, + { + "epoch": 2.032990805840995, + "grad_norm": 2.515625, + "learning_rate": 0.014097020525221285, + "loss": 3.1102, + "mean_token_accuracy": 0.40038883686065674, + "num_tokens": 3842718727.0, + "step": 7518 + }, + { + "epoch": 2.0332612222823148, + "grad_norm": 2.859375, + "learning_rate": 0.014095540267705353, + "loss": 3.2033, + "mean_token_accuracy": 0.389457643032074, + "num_tokens": 3843188886.0, + "step": 7519 + }, + { + "epoch": 2.0335316387236344, + "grad_norm": 2.5, + "learning_rate": 0.014094059915209144, + "loss": 3.2114, + "mean_token_accuracy": 0.384410560131073, + "num_tokens": 3843713128.0, + "step": 7520 + }, + { + "epoch": 2.033802055164954, + "grad_norm": 4.53125, + "learning_rate": 0.014092579467778078, + "loss": 3.342, + "mean_token_accuracy": 0.37222278118133545, + "num_tokens": 3844203036.0, + "step": 7521 + }, + { + "epoch": 2.0340724716062737, + "grad_norm": 2.4375, + "learning_rate": 0.01409109892545758, + "loss": 2.9812, + "mean_token_accuracy": 0.3962516188621521, + "num_tokens": 3844727222.0, + "step": 7522 + }, + { + "epoch": 2.0343428880475933, + "grad_norm": 2.875, + "learning_rate": 0.014089618288293073, + "loss": 3.2534, + "mean_token_accuracy": 0.3753342032432556, + "num_tokens": 3845251434.0, + "step": 7523 + }, + { + "epoch": 2.034613304488913, + "grad_norm": 2.53125, + "learning_rate": 0.014088137556329997, + "loss": 3.2169, + "mean_token_accuracy": 0.3918589651584625, + "num_tokens": 3845760650.0, + "step": 7524 + }, + { + "epoch": 2.0348837209302326, + "grad_norm": 2.75, + "learning_rate": 0.014086656729613772, + "loss": 3.1404, + "mean_token_accuracy": 0.3969309329986572, + "num_tokens": 3846284655.0, + "step": 7525 + }, + { + "epoch": 2.0351541373715523, + "grad_norm": 2.625, + "learning_rate": 0.014085175808189845, + "loss": 3.1062, + "mean_token_accuracy": 0.3978422284126282, + "num_tokens": 3846808863.0, + "step": 7526 + }, + { + "epoch": 2.035424553812872, + "grad_norm": 3.1875, + "learning_rate": 0.014083694792103653, + "loss": 3.1404, + "mean_token_accuracy": 0.394142210483551, + "num_tokens": 3847333139.0, + "step": 7527 + }, + { + "epoch": 2.0356949702541915, + "grad_norm": 3.625, + "learning_rate": 0.014082213681400634, + "loss": 2.9309, + "mean_token_accuracy": 0.4193989634513855, + "num_tokens": 3847857384.0, + "step": 7528 + }, + { + "epoch": 2.035965386695511, + "grad_norm": 3.078125, + "learning_rate": 0.014080732476126234, + "loss": 3.229, + "mean_token_accuracy": 0.41498106718063354, + "num_tokens": 3848381647.0, + "step": 7529 + }, + { + "epoch": 2.036235803136831, + "grad_norm": 4.09375, + "learning_rate": 0.014079251176325906, + "loss": 3.1917, + "mean_token_accuracy": 0.3679121434688568, + "num_tokens": 3848905711.0, + "step": 7530 + }, + { + "epoch": 2.0365062195781505, + "grad_norm": 20.375, + "learning_rate": 0.01407776978204509, + "loss": 11.5738, + "mean_token_accuracy": 0.0028686141595244408, + "num_tokens": 3849396153.0, + "step": 7531 + }, + { + "epoch": 2.03677663601947, + "grad_norm": 8.5, + "learning_rate": 0.01407628829332925, + "loss": 3.7954, + "mean_token_accuracy": 0.33274903893470764, + "num_tokens": 3849920415.0, + "step": 7532 + }, + { + "epoch": 2.0370470524607898, + "grad_norm": 2.421875, + "learning_rate": 0.014074806710223837, + "loss": 3.4928, + "mean_token_accuracy": 0.3523081839084625, + "num_tokens": 3850444579.0, + "step": 7533 + }, + { + "epoch": 2.0373174689021094, + "grad_norm": 2.65625, + "learning_rate": 0.014073325032774314, + "loss": 3.2757, + "mean_token_accuracy": 0.3854562044143677, + "num_tokens": 3850943684.0, + "step": 7534 + }, + { + "epoch": 2.037587885343429, + "grad_norm": 3.640625, + "learning_rate": 0.01407184326102614, + "loss": 3.3556, + "mean_token_accuracy": 0.3916481137275696, + "num_tokens": 3851363379.0, + "step": 7535 + }, + { + "epoch": 2.0378583017847487, + "grad_norm": 2.90625, + "learning_rate": 0.014070361395024778, + "loss": 3.3705, + "mean_token_accuracy": 0.3637816309928894, + "num_tokens": 3851887474.0, + "step": 7536 + }, + { + "epoch": 2.0381287182260683, + "grad_norm": 3.375, + "learning_rate": 0.014068879434815707, + "loss": 3.3663, + "mean_token_accuracy": 0.3795308768749237, + "num_tokens": 3852381690.0, + "step": 7537 + }, + { + "epoch": 2.038399134667388, + "grad_norm": 2.6875, + "learning_rate": 0.014067397380444381, + "loss": 3.3379, + "mean_token_accuracy": 0.3873448967933655, + "num_tokens": 3852860596.0, + "step": 7538 + }, + { + "epoch": 2.0386695511087076, + "grad_norm": 2.4375, + "learning_rate": 0.014065915231956286, + "loss": 3.0567, + "mean_token_accuracy": 0.4023604094982147, + "num_tokens": 3853384854.0, + "step": 7539 + }, + { + "epoch": 2.038939967550027, + "grad_norm": 2.484375, + "learning_rate": 0.014064432989396896, + "loss": 3.2264, + "mean_token_accuracy": 0.3820326328277588, + "num_tokens": 3853909135.0, + "step": 7540 + }, + { + "epoch": 2.0392103839913465, + "grad_norm": 2.671875, + "learning_rate": 0.01406295065281169, + "loss": 3.1397, + "mean_token_accuracy": 0.39871132373809814, + "num_tokens": 3854433294.0, + "step": 7541 + }, + { + "epoch": 2.039480800432666, + "grad_norm": 2.578125, + "learning_rate": 0.014061468222246147, + "loss": 2.9814, + "mean_token_accuracy": 0.4117462635040283, + "num_tokens": 3854957479.0, + "step": 7542 + }, + { + "epoch": 2.0397512168739858, + "grad_norm": 2.609375, + "learning_rate": 0.014059985697745763, + "loss": 3.2169, + "mean_token_accuracy": 0.3932311534881592, + "num_tokens": 3855461768.0, + "step": 7543 + }, + { + "epoch": 2.0400216333153054, + "grad_norm": 3.03125, + "learning_rate": 0.014058503079356015, + "loss": 3.0828, + "mean_token_accuracy": 0.3919681906700134, + "num_tokens": 3855957663.0, + "step": 7544 + }, + { + "epoch": 2.040292049756625, + "grad_norm": 2.59375, + "learning_rate": 0.014057020367122397, + "loss": 3.2567, + "mean_token_accuracy": 0.39775174856185913, + "num_tokens": 3856481944.0, + "step": 7545 + }, + { + "epoch": 2.0405624661979447, + "grad_norm": 3.9375, + "learning_rate": 0.014055537561090405, + "loss": 3.386, + "mean_token_accuracy": 0.3750420808792114, + "num_tokens": 3857002596.0, + "step": 7546 + }, + { + "epoch": 2.0408328826392643, + "grad_norm": 3.421875, + "learning_rate": 0.014054054661305533, + "loss": 3.2541, + "mean_token_accuracy": 0.3751458525657654, + "num_tokens": 3857526827.0, + "step": 7547 + }, + { + "epoch": 2.041103299080584, + "grad_norm": 3.421875, + "learning_rate": 0.014052571667813285, + "loss": 3.232, + "mean_token_accuracy": 0.3968048393726349, + "num_tokens": 3858017397.0, + "step": 7548 + }, + { + "epoch": 2.0413737155219036, + "grad_norm": 2.53125, + "learning_rate": 0.01405108858065916, + "loss": 3.0109, + "mean_token_accuracy": 0.41054394841194153, + "num_tokens": 3858541635.0, + "step": 7549 + }, + { + "epoch": 2.0416441319632233, + "grad_norm": 2.96875, + "learning_rate": 0.014049605399888667, + "loss": 3.2335, + "mean_token_accuracy": 0.38894301652908325, + "num_tokens": 3859065852.0, + "step": 7550 + }, + { + "epoch": 2.041914548404543, + "grad_norm": 23.875, + "learning_rate": 0.014048122125547312, + "loss": 16.4278, + "mean_token_accuracy": 0.022714028134942055, + "num_tokens": 3859590088.0, + "step": 7551 + }, + { + "epoch": 2.0421849648458625, + "grad_norm": 7.5, + "learning_rate": 0.0140466387576806, + "loss": 3.6936, + "mean_token_accuracy": 0.34471553564071655, + "num_tokens": 3860114284.0, + "step": 7552 + }, + { + "epoch": 2.042455381287182, + "grad_norm": 2.203125, + "learning_rate": 0.014045155296334058, + "loss": 3.353, + "mean_token_accuracy": 0.36292919516563416, + "num_tokens": 3860638537.0, + "step": 7553 + }, + { + "epoch": 2.042725797728502, + "grad_norm": 5.84375, + "learning_rate": 0.014043671741553194, + "loss": 3.0944, + "mean_token_accuracy": 0.40665245056152344, + "num_tokens": 3861160443.0, + "step": 7554 + }, + { + "epoch": 2.0429962141698215, + "grad_norm": 2.09375, + "learning_rate": 0.01404218809338353, + "loss": 3.3148, + "mean_token_accuracy": 0.37698379158973694, + "num_tokens": 3861684664.0, + "step": 7555 + }, + { + "epoch": 2.043266630611141, + "grad_norm": 3.0625, + "learning_rate": 0.014040704351870588, + "loss": 3.228, + "mean_token_accuracy": 0.3844153583049774, + "num_tokens": 3862208904.0, + "step": 7556 + }, + { + "epoch": 2.0435370470524608, + "grad_norm": 2.859375, + "learning_rate": 0.014039220517059898, + "loss": 3.1378, + "mean_token_accuracy": 0.3919871747493744, + "num_tokens": 3862698746.0, + "step": 7557 + }, + { + "epoch": 2.0438074634937804, + "grad_norm": 3.59375, + "learning_rate": 0.014037736588996982, + "loss": 3.2641, + "mean_token_accuracy": 0.3806452751159668, + "num_tokens": 3863222945.0, + "step": 7558 + }, + { + "epoch": 2.0440778799351, + "grad_norm": 2.828125, + "learning_rate": 0.014036252567727375, + "loss": 3.1865, + "mean_token_accuracy": 0.4163524806499481, + "num_tokens": 3863747210.0, + "step": 7559 + }, + { + "epoch": 2.0443482963764197, + "grad_norm": 3.03125, + "learning_rate": 0.01403476845329661, + "loss": 3.247, + "mean_token_accuracy": 0.3988761901855469, + "num_tokens": 3864271414.0, + "step": 7560 + }, + { + "epoch": 2.0446187128177393, + "grad_norm": 2.3125, + "learning_rate": 0.014033284245750222, + "loss": 3.1685, + "mean_token_accuracy": 0.44564539194107056, + "num_tokens": 3864700963.0, + "step": 7561 + }, + { + "epoch": 2.044889129259059, + "grad_norm": 3.171875, + "learning_rate": 0.014031799945133754, + "loss": 3.1529, + "mean_token_accuracy": 0.4189218282699585, + "num_tokens": 3865188405.0, + "step": 7562 + }, + { + "epoch": 2.0451595457003786, + "grad_norm": 2.953125, + "learning_rate": 0.014030315551492749, + "loss": 3.2022, + "mean_token_accuracy": 0.38843029737472534, + "num_tokens": 3865712395.0, + "step": 7563 + }, + { + "epoch": 2.0454299621416983, + "grad_norm": 3.234375, + "learning_rate": 0.014028831064872752, + "loss": 2.9698, + "mean_token_accuracy": 0.4096655249595642, + "num_tokens": 3866236562.0, + "step": 7564 + }, + { + "epoch": 2.045700378583018, + "grad_norm": 2.78125, + "learning_rate": 0.014027346485319309, + "loss": 3.337, + "mean_token_accuracy": 0.37447699904441833, + "num_tokens": 3866716536.0, + "step": 7565 + }, + { + "epoch": 2.0459707950243375, + "grad_norm": 2.984375, + "learning_rate": 0.014025861812877972, + "loss": 3.2335, + "mean_token_accuracy": 0.3910706341266632, + "num_tokens": 3867215464.0, + "step": 7566 + }, + { + "epoch": 2.046241211465657, + "grad_norm": 2.625, + "learning_rate": 0.0140243770475943, + "loss": 2.9907, + "mean_token_accuracy": 0.39459753036499023, + "num_tokens": 3867739641.0, + "step": 7567 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 2.890625, + "learning_rate": 0.01402289218951384, + "loss": 3.2189, + "mean_token_accuracy": 0.3926776647567749, + "num_tokens": 3868263881.0, + "step": 7568 + }, + { + "epoch": 2.0467820443482965, + "grad_norm": 3.078125, + "learning_rate": 0.014021407238682162, + "loss": 3.1712, + "mean_token_accuracy": 0.40614181756973267, + "num_tokens": 3868788151.0, + "step": 7569 + }, + { + "epoch": 2.047052460789616, + "grad_norm": 3.234375, + "learning_rate": 0.014019922195144825, + "loss": 3.2854, + "mean_token_accuracy": 0.37893146276474, + "num_tokens": 3869312428.0, + "step": 7570 + }, + { + "epoch": 2.0473228772309358, + "grad_norm": 182.0, + "learning_rate": 0.014018437058947394, + "loss": 17.979, + "mean_token_accuracy": 0.0009248762507922947, + "num_tokens": 3869836581.0, + "step": 7571 + }, + { + "epoch": 2.0475932936722554, + "grad_norm": 6.375, + "learning_rate": 0.014016951830135436, + "loss": 3.572, + "mean_token_accuracy": 0.31024062633514404, + "num_tokens": 3870319407.0, + "step": 7572 + }, + { + "epoch": 2.047863710113575, + "grad_norm": 4.0625, + "learning_rate": 0.014015466508754524, + "loss": 3.1191, + "mean_token_accuracy": 0.38753440976142883, + "num_tokens": 3870843608.0, + "step": 7573 + }, + { + "epoch": 2.0481341265548947, + "grad_norm": 2.171875, + "learning_rate": 0.014013981094850232, + "loss": 2.8934, + "mean_token_accuracy": 0.3834935426712036, + "num_tokens": 3871367878.0, + "step": 7574 + }, + { + "epoch": 2.0484045429962143, + "grad_norm": 2.828125, + "learning_rate": 0.014012495588468133, + "loss": 3.3947, + "mean_token_accuracy": 0.36729368567466736, + "num_tokens": 3871892106.0, + "step": 7575 + }, + { + "epoch": 2.048674959437534, + "grad_norm": 2.515625, + "learning_rate": 0.014011009989653815, + "loss": 3.1711, + "mean_token_accuracy": 0.3919543921947479, + "num_tokens": 3872416236.0, + "step": 7576 + }, + { + "epoch": 2.0489453758788536, + "grad_norm": 2.578125, + "learning_rate": 0.014009524298452853, + "loss": 3.2524, + "mean_token_accuracy": 0.38962531089782715, + "num_tokens": 3872940362.0, + "step": 7577 + }, + { + "epoch": 2.0492157923201733, + "grad_norm": 2.390625, + "learning_rate": 0.014008038514910837, + "loss": 3.162, + "mean_token_accuracy": 0.39798402786254883, + "num_tokens": 3873464627.0, + "step": 7578 + }, + { + "epoch": 2.049486208761493, + "grad_norm": 2.703125, + "learning_rate": 0.014006552639073354, + "loss": 2.9997, + "mean_token_accuracy": 0.4523191452026367, + "num_tokens": 3873988747.0, + "step": 7579 + }, + { + "epoch": 2.0497566252028125, + "grad_norm": 2.875, + "learning_rate": 0.014005066670985993, + "loss": 3.2868, + "mean_token_accuracy": 0.37115752696990967, + "num_tokens": 3874512958.0, + "step": 7580 + }, + { + "epoch": 2.0500270416441317, + "grad_norm": 4.15625, + "learning_rate": 0.014003580610694349, + "loss": 3.4092, + "mean_token_accuracy": 0.37366020679473877, + "num_tokens": 3875037210.0, + "step": 7581 + }, + { + "epoch": 2.0502974580854514, + "grad_norm": 3.0625, + "learning_rate": 0.014002094458244017, + "loss": 2.9816, + "mean_token_accuracy": 0.41267916560173035, + "num_tokens": 3875561298.0, + "step": 7582 + }, + { + "epoch": 2.050567874526771, + "grad_norm": 3.71875, + "learning_rate": 0.0140006082136806, + "loss": 3.2369, + "mean_token_accuracy": 0.38633495569229126, + "num_tokens": 3876067699.0, + "step": 7583 + }, + { + "epoch": 2.0508382909680907, + "grad_norm": 3.09375, + "learning_rate": 0.013999121877049698, + "loss": 3.332, + "mean_token_accuracy": 0.34483814239501953, + "num_tokens": 3876591962.0, + "step": 7584 + }, + { + "epoch": 2.0511087074094103, + "grad_norm": 2.984375, + "learning_rate": 0.01399763544839692, + "loss": 3.078, + "mean_token_accuracy": 0.39746883511543274, + "num_tokens": 3877116206.0, + "step": 7585 + }, + { + "epoch": 2.05137912385073, + "grad_norm": 2.71875, + "learning_rate": 0.01399614892776787, + "loss": 3.2827, + "mean_token_accuracy": 0.3714621067047119, + "num_tokens": 3877640380.0, + "step": 7586 + }, + { + "epoch": 2.0516495402920496, + "grad_norm": 2.640625, + "learning_rate": 0.013994662315208158, + "loss": 3.286, + "mean_token_accuracy": 0.38539859652519226, + "num_tokens": 3878164663.0, + "step": 7587 + }, + { + "epoch": 2.0519199567333692, + "grad_norm": 3.453125, + "learning_rate": 0.013993175610763398, + "loss": 3.4338, + "mean_token_accuracy": 0.3787493109703064, + "num_tokens": 3878647408.0, + "step": 7588 + }, + { + "epoch": 2.052190373174689, + "grad_norm": 2.671875, + "learning_rate": 0.01399168881447921, + "loss": 3.0371, + "mean_token_accuracy": 0.4261687695980072, + "num_tokens": 3879153980.0, + "step": 7589 + }, + { + "epoch": 2.0524607896160085, + "grad_norm": 3.859375, + "learning_rate": 0.013990201926401212, + "loss": 3.5094, + "mean_token_accuracy": 0.36947762966156006, + "num_tokens": 3879678191.0, + "step": 7590 + }, + { + "epoch": 2.052731206057328, + "grad_norm": 120.0, + "learning_rate": 0.013988714946575026, + "loss": 11.6189, + "mean_token_accuracy": 0.02934563159942627, + "num_tokens": 3880178577.0, + "step": 7591 + }, + { + "epoch": 2.053001622498648, + "grad_norm": 6.84375, + "learning_rate": 0.013987227875046273, + "loss": 3.7914, + "mean_token_accuracy": 0.3098633885383606, + "num_tokens": 3880702707.0, + "step": 7592 + }, + { + "epoch": 2.0532720389399675, + "grad_norm": 2.078125, + "learning_rate": 0.013985740711860586, + "loss": 3.2836, + "mean_token_accuracy": 0.3820856511592865, + "num_tokens": 3881226978.0, + "step": 7593 + }, + { + "epoch": 2.053542455381287, + "grad_norm": 2.484375, + "learning_rate": 0.013984253457063594, + "loss": 3.1616, + "mean_token_accuracy": 0.40435656905174255, + "num_tokens": 3881751248.0, + "step": 7594 + }, + { + "epoch": 2.0538128718226067, + "grad_norm": 2.875, + "learning_rate": 0.01398276611070093, + "loss": 3.1475, + "mean_token_accuracy": 0.3867489695549011, + "num_tokens": 3882275510.0, + "step": 7595 + }, + { + "epoch": 2.0540832882639264, + "grad_norm": 2.375, + "learning_rate": 0.013981278672818228, + "loss": 3.2353, + "mean_token_accuracy": 0.39511850476264954, + "num_tokens": 3882799609.0, + "step": 7596 + }, + { + "epoch": 2.054353704705246, + "grad_norm": 3.453125, + "learning_rate": 0.013979791143461128, + "loss": 3.1513, + "mean_token_accuracy": 0.40412312746047974, + "num_tokens": 3883323815.0, + "step": 7597 + }, + { + "epoch": 2.0546241211465657, + "grad_norm": 3.15625, + "learning_rate": 0.013978303522675276, + "loss": 3.2458, + "mean_token_accuracy": 0.4135640859603882, + "num_tokens": 3883790018.0, + "step": 7598 + }, + { + "epoch": 2.0548945375878853, + "grad_norm": 2.734375, + "learning_rate": 0.013976815810506313, + "loss": 3.1795, + "mean_token_accuracy": 0.3815675377845764, + "num_tokens": 3884314234.0, + "step": 7599 + }, + { + "epoch": 2.055164954029205, + "grad_norm": 2.328125, + "learning_rate": 0.013975328006999885, + "loss": 3.2276, + "mean_token_accuracy": 0.40094518661499023, + "num_tokens": 3884838490.0, + "step": 7600 + }, + { + "epoch": 2.0554353704705246, + "grad_norm": 2.9375, + "learning_rate": 0.013973840112201643, + "loss": 3.2699, + "mean_token_accuracy": 0.398322731256485, + "num_tokens": 3885280894.0, + "step": 7601 + }, + { + "epoch": 2.0557057869118442, + "grad_norm": 2.390625, + "learning_rate": 0.013972352126157241, + "loss": 3.1636, + "mean_token_accuracy": 0.43608999252319336, + "num_tokens": 3885740932.0, + "step": 7602 + }, + { + "epoch": 2.055976203353164, + "grad_norm": 2.578125, + "learning_rate": 0.013970864048912337, + "loss": 3.0172, + "mean_token_accuracy": 0.4005451500415802, + "num_tokens": 3886265205.0, + "step": 7603 + }, + { + "epoch": 2.0562466197944835, + "grad_norm": 2.109375, + "learning_rate": 0.013969375880512587, + "loss": 3.2723, + "mean_token_accuracy": 0.3958137035369873, + "num_tokens": 3886748800.0, + "step": 7604 + }, + { + "epoch": 2.056517036235803, + "grad_norm": 3.15625, + "learning_rate": 0.013967887621003651, + "loss": 3.1895, + "mean_token_accuracy": 0.4202324151992798, + "num_tokens": 3887150741.0, + "step": 7605 + }, + { + "epoch": 2.056787452677123, + "grad_norm": 2.703125, + "learning_rate": 0.013966399270431197, + "loss": 3.0282, + "mean_token_accuracy": 0.43984490633010864, + "num_tokens": 3887614932.0, + "step": 7606 + }, + { + "epoch": 2.0570578691184425, + "grad_norm": 3.28125, + "learning_rate": 0.013964910828840889, + "loss": 3.1363, + "mean_token_accuracy": 0.389222115278244, + "num_tokens": 3888139047.0, + "step": 7607 + }, + { + "epoch": 2.057328285559762, + "grad_norm": 2.96875, + "learning_rate": 0.013963422296278396, + "loss": 3.2294, + "mean_token_accuracy": 0.38993173837661743, + "num_tokens": 3888652642.0, + "step": 7608 + }, + { + "epoch": 2.0575987020010817, + "grad_norm": 3.515625, + "learning_rate": 0.013961933672789398, + "loss": 3.2385, + "mean_token_accuracy": 0.37208104133605957, + "num_tokens": 3889176813.0, + "step": 7609 + }, + { + "epoch": 2.0578691184424014, + "grad_norm": 2.734375, + "learning_rate": 0.013960444958419556, + "loss": 2.9879, + "mean_token_accuracy": 0.4106433391571045, + "num_tokens": 3889678988.0, + "step": 7610 + }, + { + "epoch": 2.058139534883721, + "grad_norm": 20.375, + "learning_rate": 0.013958956153214559, + "loss": 10.1798, + "mean_token_accuracy": 0.01921110600233078, + "num_tokens": 3890203260.0, + "step": 7611 + }, + { + "epoch": 2.0584099513250407, + "grad_norm": 6.40625, + "learning_rate": 0.013957467257220085, + "loss": 3.5786, + "mean_token_accuracy": 0.35724034905433655, + "num_tokens": 3890727461.0, + "step": 7612 + }, + { + "epoch": 2.0586803677663603, + "grad_norm": 2.1875, + "learning_rate": 0.013955978270481822, + "loss": 3.1868, + "mean_token_accuracy": 0.3852211833000183, + "num_tokens": 3891251604.0, + "step": 7613 + }, + { + "epoch": 2.05895078420768, + "grad_norm": 2.296875, + "learning_rate": 0.01395448919304545, + "loss": 3.2485, + "mean_token_accuracy": 0.398928165435791, + "num_tokens": 3891775714.0, + "step": 7614 + }, + { + "epoch": 2.0592212006489996, + "grad_norm": 2.4375, + "learning_rate": 0.013953000024956662, + "loss": 3.0726, + "mean_token_accuracy": 0.4072415828704834, + "num_tokens": 3892252486.0, + "step": 7615 + }, + { + "epoch": 2.0594916170903192, + "grad_norm": 2.75, + "learning_rate": 0.01395151076626115, + "loss": 3.3414, + "mean_token_accuracy": 0.3948609232902527, + "num_tokens": 3892776699.0, + "step": 7616 + }, + { + "epoch": 2.059762033531639, + "grad_norm": 3.4375, + "learning_rate": 0.013950021417004606, + "loss": 3.0702, + "mean_token_accuracy": 0.3844633102416992, + "num_tokens": 3893300942.0, + "step": 7617 + }, + { + "epoch": 2.0600324499729585, + "grad_norm": 2.9375, + "learning_rate": 0.013948531977232727, + "loss": 3.0344, + "mean_token_accuracy": 0.3844373822212219, + "num_tokens": 3893825101.0, + "step": 7618 + }, + { + "epoch": 2.060302866414278, + "grad_norm": 3.125, + "learning_rate": 0.01394704244699122, + "loss": 3.1488, + "mean_token_accuracy": 0.3806011378765106, + "num_tokens": 3894349076.0, + "step": 7619 + }, + { + "epoch": 2.060573282855598, + "grad_norm": 2.796875, + "learning_rate": 0.013945552826325782, + "loss": 3.0798, + "mean_token_accuracy": 0.3823135495185852, + "num_tokens": 3894843997.0, + "step": 7620 + }, + { + "epoch": 2.0608436992969175, + "grad_norm": 3.390625, + "learning_rate": 0.013944063115282124, + "loss": 3.2014, + "mean_token_accuracy": 0.36672669649124146, + "num_tokens": 3895368176.0, + "step": 7621 + }, + { + "epoch": 2.0611141157382367, + "grad_norm": 2.359375, + "learning_rate": 0.013942573313905951, + "loss": 3.0578, + "mean_token_accuracy": 0.4062029719352722, + "num_tokens": 3895882822.0, + "step": 7622 + }, + { + "epoch": 2.0613845321795563, + "grad_norm": 4.6875, + "learning_rate": 0.013941083422242973, + "loss": 3.2734, + "mean_token_accuracy": 0.4125145673751831, + "num_tokens": 3896406994.0, + "step": 7623 + }, + { + "epoch": 2.061654948620876, + "grad_norm": 3.515625, + "learning_rate": 0.013939593440338904, + "loss": 3.4543, + "mean_token_accuracy": 0.37621086835861206, + "num_tokens": 3896931274.0, + "step": 7624 + }, + { + "epoch": 2.0619253650621956, + "grad_norm": 14.9375, + "learning_rate": 0.013938103368239467, + "loss": 3.4443, + "mean_token_accuracy": 0.4094623327255249, + "num_tokens": 3897364500.0, + "step": 7625 + }, + { + "epoch": 2.0621957815035152, + "grad_norm": 3.015625, + "learning_rate": 0.013936613205990378, + "loss": 3.3332, + "mean_token_accuracy": 0.373924195766449, + "num_tokens": 3897848904.0, + "step": 7626 + }, + { + "epoch": 2.062466197944835, + "grad_norm": 2.734375, + "learning_rate": 0.013935122953637358, + "loss": 3.2449, + "mean_token_accuracy": 0.3787178099155426, + "num_tokens": 3898373099.0, + "step": 7627 + }, + { + "epoch": 2.0627366143861545, + "grad_norm": 3.125, + "learning_rate": 0.013933632611226137, + "loss": 3.2176, + "mean_token_accuracy": 0.3851190507411957, + "num_tokens": 3898897202.0, + "step": 7628 + }, + { + "epoch": 2.063007030827474, + "grad_norm": 3.125, + "learning_rate": 0.01393214217880244, + "loss": 3.1825, + "mean_token_accuracy": 0.41015028953552246, + "num_tokens": 3899421444.0, + "step": 7629 + }, + { + "epoch": 2.063277447268794, + "grad_norm": 4.6875, + "learning_rate": 0.013930651656411998, + "loss": 3.2175, + "mean_token_accuracy": 0.392638236284256, + "num_tokens": 3899945727.0, + "step": 7630 + }, + { + "epoch": 2.0635478637101135, + "grad_norm": 44.75, + "learning_rate": 0.013929161044100542, + "loss": 11.9594, + "mean_token_accuracy": 0.009973889216780663, + "num_tokens": 3900383545.0, + "step": 7631 + }, + { + "epoch": 2.063818280151433, + "grad_norm": 3.640625, + "learning_rate": 0.013927670341913811, + "loss": 3.4034, + "mean_token_accuracy": 0.3789122700691223, + "num_tokens": 3900907732.0, + "step": 7632 + }, + { + "epoch": 2.0640886965927527, + "grad_norm": 2.203125, + "learning_rate": 0.013926179549897548, + "loss": 3.1767, + "mean_token_accuracy": 0.3821791708469391, + "num_tokens": 3901373034.0, + "step": 7633 + }, + { + "epoch": 2.0643591130340724, + "grad_norm": 3.203125, + "learning_rate": 0.01392468866809749, + "loss": 3.0962, + "mean_token_accuracy": 0.44575849175453186, + "num_tokens": 3901897197.0, + "step": 7634 + }, + { + "epoch": 2.064629529475392, + "grad_norm": 3.0, + "learning_rate": 0.01392319769655938, + "loss": 3.1091, + "mean_token_accuracy": 0.3928613066673279, + "num_tokens": 3902418271.0, + "step": 7635 + }, + { + "epoch": 2.0648999459167117, + "grad_norm": 3.40625, + "learning_rate": 0.013921706635328972, + "loss": 3.35, + "mean_token_accuracy": 0.3924407958984375, + "num_tokens": 3902942412.0, + "step": 7636 + }, + { + "epoch": 2.0651703623580313, + "grad_norm": 2.875, + "learning_rate": 0.01392021548445201, + "loss": 3.2371, + "mean_token_accuracy": 0.37533169984817505, + "num_tokens": 3903466596.0, + "step": 7637 + }, + { + "epoch": 2.065440778799351, + "grad_norm": 2.453125, + "learning_rate": 0.013918724243974247, + "loss": 2.9516, + "mean_token_accuracy": 0.4051320552825928, + "num_tokens": 3903990845.0, + "step": 7638 + }, + { + "epoch": 2.0657111952406706, + "grad_norm": 2.578125, + "learning_rate": 0.013917232913941445, + "loss": 3.257, + "mean_token_accuracy": 0.3808358907699585, + "num_tokens": 3904486779.0, + "step": 7639 + }, + { + "epoch": 2.0659816116819902, + "grad_norm": 2.84375, + "learning_rate": 0.013915741494399354, + "loss": 3.3337, + "mean_token_accuracy": 0.4177692234516144, + "num_tokens": 3904881749.0, + "step": 7640 + }, + { + "epoch": 2.06625202812331, + "grad_norm": 2.53125, + "learning_rate": 0.013914249985393742, + "loss": 3.1742, + "mean_token_accuracy": 0.37177324295043945, + "num_tokens": 3905405821.0, + "step": 7641 + }, + { + "epoch": 2.0665224445646295, + "grad_norm": 2.78125, + "learning_rate": 0.013912758386970371, + "loss": 3.4411, + "mean_token_accuracy": 0.3488498330116272, + "num_tokens": 3905930076.0, + "step": 7642 + }, + { + "epoch": 2.066792861005949, + "grad_norm": 3.40625, + "learning_rate": 0.013911266699175006, + "loss": 3.2581, + "mean_token_accuracy": 0.38801002502441406, + "num_tokens": 3906454251.0, + "step": 7643 + }, + { + "epoch": 2.067063277447269, + "grad_norm": 2.890625, + "learning_rate": 0.013909774922053418, + "loss": 3.2854, + "mean_token_accuracy": 0.39837324619293213, + "num_tokens": 3906950014.0, + "step": 7644 + }, + { + "epoch": 2.0673336938885885, + "grad_norm": 3.71875, + "learning_rate": 0.013908283055651376, + "loss": 3.082, + "mean_token_accuracy": 0.3956509828567505, + "num_tokens": 3907474172.0, + "step": 7645 + }, + { + "epoch": 2.067604110329908, + "grad_norm": 2.703125, + "learning_rate": 0.013906791100014658, + "loss": 3.0676, + "mean_token_accuracy": 0.4106798768043518, + "num_tokens": 3907955764.0, + "step": 7646 + }, + { + "epoch": 2.0678745267712277, + "grad_norm": 3.4375, + "learning_rate": 0.01390529905518904, + "loss": 3.3576, + "mean_token_accuracy": 0.38567274808883667, + "num_tokens": 3908479947.0, + "step": 7647 + }, + { + "epoch": 2.0681449432125474, + "grad_norm": 2.953125, + "learning_rate": 0.013903806921220303, + "loss": 3.2879, + "mean_token_accuracy": 0.39723604917526245, + "num_tokens": 3909004116.0, + "step": 7648 + }, + { + "epoch": 2.068415359653867, + "grad_norm": 3.234375, + "learning_rate": 0.013902314698154235, + "loss": 3.1747, + "mean_token_accuracy": 0.38466596603393555, + "num_tokens": 3909528329.0, + "step": 7649 + }, + { + "epoch": 2.0686857760951867, + "grad_norm": 2.390625, + "learning_rate": 0.01390082238603661, + "loss": 3.0007, + "mean_token_accuracy": 0.4135932922363281, + "num_tokens": 3910040711.0, + "step": 7650 + }, + { + "epoch": 2.0689561925365063, + "grad_norm": 324.0, + "learning_rate": 0.013899329984913228, + "loss": 15.0767, + "mean_token_accuracy": 3.462105814833194e-05, + "num_tokens": 3910540127.0, + "step": 7651 + }, + { + "epoch": 2.069226608977826, + "grad_norm": 9.125, + "learning_rate": 0.013897837494829876, + "loss": 3.9652, + "mean_token_accuracy": 0.3115316927433014, + "num_tokens": 3911064393.0, + "step": 7652 + }, + { + "epoch": 2.0694970254191456, + "grad_norm": 2.5625, + "learning_rate": 0.013896344915832344, + "loss": 3.2643, + "mean_token_accuracy": 0.3838593661785126, + "num_tokens": 3911569474.0, + "step": 7653 + }, + { + "epoch": 2.0697674418604652, + "grad_norm": 3.296875, + "learning_rate": 0.013894852247966435, + "loss": 3.1197, + "mean_token_accuracy": 0.3995039463043213, + "num_tokens": 3912088349.0, + "step": 7654 + }, + { + "epoch": 2.070037858301785, + "grad_norm": 2.78125, + "learning_rate": 0.013893359491277946, + "loss": 3.2042, + "mean_token_accuracy": 0.3772951364517212, + "num_tokens": 3912612624.0, + "step": 7655 + }, + { + "epoch": 2.0703082747431045, + "grad_norm": 2.453125, + "learning_rate": 0.013891866645812677, + "loss": 3.1944, + "mean_token_accuracy": 0.39957594871520996, + "num_tokens": 3913094142.0, + "step": 7656 + }, + { + "epoch": 2.070578691184424, + "grad_norm": 2.265625, + "learning_rate": 0.013890373711616435, + "loss": 3.269, + "mean_token_accuracy": 0.38015246391296387, + "num_tokens": 3913561739.0, + "step": 7657 + }, + { + "epoch": 2.070849107625744, + "grad_norm": 2.21875, + "learning_rate": 0.013888880688735028, + "loss": 3.0274, + "mean_token_accuracy": 0.404524564743042, + "num_tokens": 3914085940.0, + "step": 7658 + }, + { + "epoch": 2.0711195240670635, + "grad_norm": 2.59375, + "learning_rate": 0.01388738757721427, + "loss": 3.1218, + "mean_token_accuracy": 0.38591110706329346, + "num_tokens": 3914610161.0, + "step": 7659 + }, + { + "epoch": 2.071389940508383, + "grad_norm": 3.921875, + "learning_rate": 0.013885894377099962, + "loss": 3.477, + "mean_token_accuracy": 0.38291195034980774, + "num_tokens": 3915134373.0, + "step": 7660 + }, + { + "epoch": 2.0716603569497027, + "grad_norm": 3.53125, + "learning_rate": 0.013884401088437932, + "loss": 3.0421, + "mean_token_accuracy": 0.440078467130661, + "num_tokens": 3915658621.0, + "step": 7661 + }, + { + "epoch": 2.0719307733910224, + "grad_norm": 2.078125, + "learning_rate": 0.013882907711273996, + "loss": 3.0664, + "mean_token_accuracy": 0.4048006534576416, + "num_tokens": 3916182825.0, + "step": 7662 + }, + { + "epoch": 2.0722011898323416, + "grad_norm": 3.515625, + "learning_rate": 0.01388141424565397, + "loss": 3.2438, + "mean_token_accuracy": 0.37336450815200806, + "num_tokens": 3916707008.0, + "step": 7663 + }, + { + "epoch": 2.0724716062736612, + "grad_norm": 3.203125, + "learning_rate": 0.013879920691623682, + "loss": 3.0803, + "mean_token_accuracy": 0.40325838327407837, + "num_tokens": 3917231190.0, + "step": 7664 + }, + { + "epoch": 2.072742022714981, + "grad_norm": 2.953125, + "learning_rate": 0.013878427049228962, + "loss": 3.2855, + "mean_token_accuracy": 0.37850937247276306, + "num_tokens": 3917755289.0, + "step": 7665 + }, + { + "epoch": 2.0730124391563005, + "grad_norm": 2.703125, + "learning_rate": 0.01387693331851563, + "loss": 3.3013, + "mean_token_accuracy": 0.3852534890174866, + "num_tokens": 3918279529.0, + "step": 7666 + }, + { + "epoch": 2.07328285559762, + "grad_norm": 2.296875, + "learning_rate": 0.013875439499529523, + "loss": 2.9792, + "mean_token_accuracy": 0.42124316096305847, + "num_tokens": 3918803682.0, + "step": 7667 + }, + { + "epoch": 2.07355327203894, + "grad_norm": 2.5, + "learning_rate": 0.01387394559231648, + "loss": 3.014, + "mean_token_accuracy": 0.40647900104522705, + "num_tokens": 3919327741.0, + "step": 7668 + }, + { + "epoch": 2.0738236884802594, + "grad_norm": 2.625, + "learning_rate": 0.01387245159692233, + "loss": 3.2703, + "mean_token_accuracy": 0.38230809569358826, + "num_tokens": 3919852012.0, + "step": 7669 + }, + { + "epoch": 2.074094104921579, + "grad_norm": 2.671875, + "learning_rate": 0.013870957513392922, + "loss": 2.8878, + "mean_token_accuracy": 0.40190890431404114, + "num_tokens": 3920352252.0, + "step": 7670 + }, + { + "epoch": 2.0743645213628987, + "grad_norm": 55.25, + "learning_rate": 0.013869463341774092, + "loss": 11.0372, + "mean_token_accuracy": 0.03295872360467911, + "num_tokens": 3920876458.0, + "step": 7671 + }, + { + "epoch": 2.0746349378042184, + "grad_norm": 5.84375, + "learning_rate": 0.013867969082111692, + "loss": 3.5173, + "mean_token_accuracy": 0.36187201738357544, + "num_tokens": 3921400705.0, + "step": 7672 + }, + { + "epoch": 2.074905354245538, + "grad_norm": 1.921875, + "learning_rate": 0.013866474734451562, + "loss": 3.306, + "mean_token_accuracy": 0.39144375920295715, + "num_tokens": 3921880698.0, + "step": 7673 + }, + { + "epoch": 2.0751757706868577, + "grad_norm": 3.5, + "learning_rate": 0.013864980298839558, + "loss": 3.3386, + "mean_token_accuracy": 0.37941229343414307, + "num_tokens": 3922404917.0, + "step": 7674 + }, + { + "epoch": 2.0754461871281773, + "grad_norm": 2.796875, + "learning_rate": 0.013863485775321534, + "loss": 3.1793, + "mean_token_accuracy": 0.40728360414505005, + "num_tokens": 3922929130.0, + "step": 7675 + }, + { + "epoch": 2.075716603569497, + "grad_norm": 3.5625, + "learning_rate": 0.013861991163943344, + "loss": 3.4042, + "mean_token_accuracy": 0.38472557067871094, + "num_tokens": 3923453357.0, + "step": 7676 + }, + { + "epoch": 2.0759870200108166, + "grad_norm": 3.171875, + "learning_rate": 0.013860496464750849, + "loss": 3.1022, + "mean_token_accuracy": 0.39677441120147705, + "num_tokens": 3923977430.0, + "step": 7677 + }, + { + "epoch": 2.0762574364521362, + "grad_norm": 2.453125, + "learning_rate": 0.013859001677789914, + "loss": 2.9493, + "mean_token_accuracy": 0.4448332190513611, + "num_tokens": 3924501678.0, + "step": 7678 + }, + { + "epoch": 2.076527852893456, + "grad_norm": 2.875, + "learning_rate": 0.013857506803106394, + "loss": 3.3484, + "mean_token_accuracy": 0.39267587661743164, + "num_tokens": 3924982617.0, + "step": 7679 + }, + { + "epoch": 2.0767982693347755, + "grad_norm": 2.46875, + "learning_rate": 0.013856011840746165, + "loss": 3.1799, + "mean_token_accuracy": 0.3869524598121643, + "num_tokens": 3925506875.0, + "step": 7680 + }, + { + "epoch": 2.077068685776095, + "grad_norm": 2.578125, + "learning_rate": 0.013854516790755094, + "loss": 3.0335, + "mean_token_accuracy": 0.4084114730358124, + "num_tokens": 3926030998.0, + "step": 7681 + }, + { + "epoch": 2.077339102217415, + "grad_norm": 2.8125, + "learning_rate": 0.013853021653179053, + "loss": 3.2311, + "mean_token_accuracy": 0.39353126287460327, + "num_tokens": 3926555161.0, + "step": 7682 + }, + { + "epoch": 2.0776095186587344, + "grad_norm": 3.796875, + "learning_rate": 0.013851526428063914, + "loss": 3.1703, + "mean_token_accuracy": 0.39283984899520874, + "num_tokens": 3927059135.0, + "step": 7683 + }, + { + "epoch": 2.077879935100054, + "grad_norm": 2.84375, + "learning_rate": 0.01385003111545556, + "loss": 3.1205, + "mean_token_accuracy": 0.3979465663433075, + "num_tokens": 3927583387.0, + "step": 7684 + }, + { + "epoch": 2.0781503515413737, + "grad_norm": 3.109375, + "learning_rate": 0.013848535715399875, + "loss": 3.1472, + "mean_token_accuracy": 0.3720361292362213, + "num_tokens": 3928107634.0, + "step": 7685 + }, + { + "epoch": 2.0784207679826934, + "grad_norm": 2.6875, + "learning_rate": 0.013847040227942736, + "loss": 3.2151, + "mean_token_accuracy": 0.3886496126651764, + "num_tokens": 3928631875.0, + "step": 7686 + }, + { + "epoch": 2.078691184424013, + "grad_norm": 3.25, + "learning_rate": 0.013845544653130027, + "loss": 3.3365, + "mean_token_accuracy": 0.3992064893245697, + "num_tokens": 3929106042.0, + "step": 7687 + }, + { + "epoch": 2.0789616008653327, + "grad_norm": 2.390625, + "learning_rate": 0.013844048991007638, + "loss": 3.1591, + "mean_token_accuracy": 0.40659838914871216, + "num_tokens": 3929630315.0, + "step": 7688 + }, + { + "epoch": 2.0792320173066523, + "grad_norm": 3.0, + "learning_rate": 0.013842553241621466, + "loss": 3.1383, + "mean_token_accuracy": 0.39467769861221313, + "num_tokens": 3930154595.0, + "step": 7689 + }, + { + "epoch": 2.079502433747972, + "grad_norm": 2.96875, + "learning_rate": 0.0138410574050174, + "loss": 3.0862, + "mean_token_accuracy": 0.3873860538005829, + "num_tokens": 3930678793.0, + "step": 7690 + }, + { + "epoch": 2.0797728501892916, + "grad_norm": 144.0, + "learning_rate": 0.013839561481241339, + "loss": 17.2563, + "mean_token_accuracy": 7.868357351981103e-05, + "num_tokens": 3931195987.0, + "step": 7691 + }, + { + "epoch": 2.0800432666306112, + "grad_norm": 5.65625, + "learning_rate": 0.013838065470339181, + "loss": 3.6363, + "mean_token_accuracy": 0.37621617317199707, + "num_tokens": 3931720022.0, + "step": 7692 + }, + { + "epoch": 2.080313683071931, + "grad_norm": 2.15625, + "learning_rate": 0.013836569372356826, + "loss": 3.2054, + "mean_token_accuracy": 0.3929956555366516, + "num_tokens": 3932244306.0, + "step": 7693 + }, + { + "epoch": 2.0805840995132505, + "grad_norm": 2.203125, + "learning_rate": 0.013835073187340181, + "loss": 3.3053, + "mean_token_accuracy": 0.37642979621887207, + "num_tokens": 3932768391.0, + "step": 7694 + }, + { + "epoch": 2.08085451595457, + "grad_norm": 2.890625, + "learning_rate": 0.013833576915335155, + "loss": 3.2636, + "mean_token_accuracy": 0.39570915699005127, + "num_tokens": 3933257543.0, + "step": 7695 + }, + { + "epoch": 2.08112493239589, + "grad_norm": 2.890625, + "learning_rate": 0.013832080556387655, + "loss": 3.3036, + "mean_token_accuracy": 0.39234721660614014, + "num_tokens": 3933753511.0, + "step": 7696 + }, + { + "epoch": 2.0813953488372094, + "grad_norm": 2.96875, + "learning_rate": 0.013830584110543595, + "loss": 3.2164, + "mean_token_accuracy": 0.39236944913864136, + "num_tokens": 3934277630.0, + "step": 7697 + }, + { + "epoch": 2.081665765278529, + "grad_norm": 2.578125, + "learning_rate": 0.013829087577848894, + "loss": 3.1733, + "mean_token_accuracy": 0.3973180055618286, + "num_tokens": 3934794084.0, + "step": 7698 + }, + { + "epoch": 2.0819361817198487, + "grad_norm": 3.171875, + "learning_rate": 0.013827590958349461, + "loss": 3.2848, + "mean_token_accuracy": 0.3710297644138336, + "num_tokens": 3935318185.0, + "step": 7699 + }, + { + "epoch": 2.0822065981611684, + "grad_norm": 2.90625, + "learning_rate": 0.013826094252091223, + "loss": 3.3159, + "mean_token_accuracy": 0.39192745089530945, + "num_tokens": 3935802096.0, + "step": 7700 + }, + { + "epoch": 2.082477014602488, + "grad_norm": 4.15625, + "learning_rate": 0.013824597459120106, + "loss": 3.3973, + "mean_token_accuracy": 0.35050034523010254, + "num_tokens": 3936326376.0, + "step": 7701 + }, + { + "epoch": 2.0827474310438077, + "grad_norm": 3.9375, + "learning_rate": 0.013823100579482028, + "loss": 3.3924, + "mean_token_accuracy": 0.3916892111301422, + "num_tokens": 3936823544.0, + "step": 7702 + }, + { + "epoch": 2.0830178474851273, + "grad_norm": 3.578125, + "learning_rate": 0.013821603613222922, + "loss": 3.3746, + "mean_token_accuracy": 0.3741528391838074, + "num_tokens": 3937347727.0, + "step": 7703 + }, + { + "epoch": 2.0832882639264465, + "grad_norm": 2.5625, + "learning_rate": 0.013820106560388724, + "loss": 3.3578, + "mean_token_accuracy": 0.3989184498786926, + "num_tokens": 3937831973.0, + "step": 7704 + }, + { + "epoch": 2.083558680367766, + "grad_norm": 2.609375, + "learning_rate": 0.013818609421025361, + "loss": 3.2026, + "mean_token_accuracy": 0.382987916469574, + "num_tokens": 3938356250.0, + "step": 7705 + }, + { + "epoch": 2.083829096809086, + "grad_norm": 3.125, + "learning_rate": 0.013817112195178768, + "loss": 2.9271, + "mean_token_accuracy": 0.41513174772262573, + "num_tokens": 3938775404.0, + "step": 7706 + }, + { + "epoch": 2.0840995132504054, + "grad_norm": 2.78125, + "learning_rate": 0.013815614882894893, + "loss": 3.1193, + "mean_token_accuracy": 0.3830118775367737, + "num_tokens": 3939299692.0, + "step": 7707 + }, + { + "epoch": 2.084369929691725, + "grad_norm": 2.453125, + "learning_rate": 0.013814117484219676, + "loss": 3.0582, + "mean_token_accuracy": 0.39131686091423035, + "num_tokens": 3939823869.0, + "step": 7708 + }, + { + "epoch": 2.0846403461330447, + "grad_norm": 4.40625, + "learning_rate": 0.013812619999199052, + "loss": 2.9925, + "mean_token_accuracy": 0.41453588008880615, + "num_tokens": 3940343769.0, + "step": 7709 + }, + { + "epoch": 2.0849107625743644, + "grad_norm": 2.90625, + "learning_rate": 0.013811122427878976, + "loss": 3.207, + "mean_token_accuracy": 0.3890611231327057, + "num_tokens": 3940867983.0, + "step": 7710 + }, + { + "epoch": 2.085181179015684, + "grad_norm": 47.5, + "learning_rate": 0.0138096247703054, + "loss": 11.9311, + "mean_token_accuracy": 0.008903317153453827, + "num_tokens": 3941392202.0, + "step": 7711 + }, + { + "epoch": 2.0854515954570036, + "grad_norm": 5.875, + "learning_rate": 0.01380812702652427, + "loss": 3.5911, + "mean_token_accuracy": 0.33787786960601807, + "num_tokens": 3941916391.0, + "step": 7712 + }, + { + "epoch": 2.0857220118983233, + "grad_norm": 2.046875, + "learning_rate": 0.013806629196581549, + "loss": 3.095, + "mean_token_accuracy": 0.3964437246322632, + "num_tokens": 3942440537.0, + "step": 7713 + }, + { + "epoch": 2.085992428339643, + "grad_norm": 2.90625, + "learning_rate": 0.013805131280523188, + "loss": 3.1554, + "mean_token_accuracy": 0.3906751871109009, + "num_tokens": 3942964782.0, + "step": 7714 + }, + { + "epoch": 2.0862628447809626, + "grad_norm": 3.1875, + "learning_rate": 0.013803633278395152, + "loss": 3.3342, + "mean_token_accuracy": 0.3858497142791748, + "num_tokens": 3943481261.0, + "step": 7715 + }, + { + "epoch": 2.086533261222282, + "grad_norm": 2.703125, + "learning_rate": 0.0138021351902434, + "loss": 3.1572, + "mean_token_accuracy": 0.3897297978401184, + "num_tokens": 3944005455.0, + "step": 7716 + }, + { + "epoch": 2.086803677663602, + "grad_norm": 2.875, + "learning_rate": 0.013800637016113898, + "loss": 3.0783, + "mean_token_accuracy": 0.41915178298950195, + "num_tokens": 3944529613.0, + "step": 7717 + }, + { + "epoch": 2.0870740941049215, + "grad_norm": 2.546875, + "learning_rate": 0.01379913875605262, + "loss": 3.1626, + "mean_token_accuracy": 0.3933075964450836, + "num_tokens": 3945049602.0, + "step": 7718 + }, + { + "epoch": 2.087344510546241, + "grad_norm": 3.28125, + "learning_rate": 0.01379764041010553, + "loss": 3.3012, + "mean_token_accuracy": 0.3928695321083069, + "num_tokens": 3945573886.0, + "step": 7719 + }, + { + "epoch": 2.087614926987561, + "grad_norm": 3.125, + "learning_rate": 0.013796141978318606, + "loss": 3.2537, + "mean_token_accuracy": 0.4189486503601074, + "num_tokens": 3946098165.0, + "step": 7720 + }, + { + "epoch": 2.0878853434288804, + "grad_norm": 3.21875, + "learning_rate": 0.013794643460737825, + "loss": 3.3816, + "mean_token_accuracy": 0.3841911256313324, + "num_tokens": 3946618756.0, + "step": 7721 + }, + { + "epoch": 2.0881557598702, + "grad_norm": 2.25, + "learning_rate": 0.013793144857409163, + "loss": 2.9865, + "mean_token_accuracy": 0.4017699956893921, + "num_tokens": 3947142983.0, + "step": 7722 + }, + { + "epoch": 2.0884261763115197, + "grad_norm": 14.75, + "learning_rate": 0.013791646168378602, + "loss": 2.9678, + "mean_token_accuracy": 0.3840733468532562, + "num_tokens": 3947667084.0, + "step": 7723 + }, + { + "epoch": 2.0886965927528394, + "grad_norm": 3.21875, + "learning_rate": 0.013790147393692127, + "loss": 3.1568, + "mean_token_accuracy": 0.38176846504211426, + "num_tokens": 3948191363.0, + "step": 7724 + }, + { + "epoch": 2.088967009194159, + "grad_norm": 5.6875, + "learning_rate": 0.013788648533395727, + "loss": 3.0189, + "mean_token_accuracy": 0.4366835653781891, + "num_tokens": 3948688345.0, + "step": 7725 + }, + { + "epoch": 2.0892374256354787, + "grad_norm": 2.109375, + "learning_rate": 0.013787149587535383, + "loss": 3.3501, + "mean_token_accuracy": 0.39201411604881287, + "num_tokens": 3949153067.0, + "step": 7726 + }, + { + "epoch": 2.0895078420767983, + "grad_norm": 4.4375, + "learning_rate": 0.0137856505561571, + "loss": 3.436, + "mean_token_accuracy": 0.3594006896018982, + "num_tokens": 3949677264.0, + "step": 7727 + }, + { + "epoch": 2.089778258518118, + "grad_norm": 2.75, + "learning_rate": 0.013784151439306865, + "loss": 3.1744, + "mean_token_accuracy": 0.39548569917678833, + "num_tokens": 3950201543.0, + "step": 7728 + }, + { + "epoch": 2.0900486749594376, + "grad_norm": 3.234375, + "learning_rate": 0.013782652237030672, + "loss": 3.0767, + "mean_token_accuracy": 0.38879650831222534, + "num_tokens": 3950713205.0, + "step": 7729 + }, + { + "epoch": 2.0903190914007572, + "grad_norm": 2.96875, + "learning_rate": 0.013781152949374527, + "loss": 3.2801, + "mean_token_accuracy": 0.39306747913360596, + "num_tokens": 3951202326.0, + "step": 7730 + }, + { + "epoch": 2.090589507842077, + "grad_norm": 26.125, + "learning_rate": 0.013779653576384433, + "loss": 9.9014, + "mean_token_accuracy": 0.007512656506150961, + "num_tokens": 3951726504.0, + "step": 7731 + }, + { + "epoch": 2.0908599242833965, + "grad_norm": 7.6875, + "learning_rate": 0.01377815411810639, + "loss": 3.6279, + "mean_token_accuracy": 0.3098674416542053, + "num_tokens": 3952250758.0, + "step": 7732 + }, + { + "epoch": 2.091130340724716, + "grad_norm": 2.28125, + "learning_rate": 0.013776654574586409, + "loss": 3.183, + "mean_token_accuracy": 0.39132726192474365, + "num_tokens": 3952700869.0, + "step": 7733 + }, + { + "epoch": 2.091400757166036, + "grad_norm": 2.46875, + "learning_rate": 0.013775154945870501, + "loss": 3.2797, + "mean_token_accuracy": 0.39058414101600647, + "num_tokens": 3953225103.0, + "step": 7734 + }, + { + "epoch": 2.0916711736073554, + "grad_norm": 2.96875, + "learning_rate": 0.013773655232004679, + "loss": 3.1559, + "mean_token_accuracy": 0.39475172758102417, + "num_tokens": 3953749352.0, + "step": 7735 + }, + { + "epoch": 2.091941590048675, + "grad_norm": 2.9375, + "learning_rate": 0.013772155433034956, + "loss": 3.0892, + "mean_token_accuracy": 0.3931719958782196, + "num_tokens": 3954236029.0, + "step": 7736 + }, + { + "epoch": 2.0922120064899947, + "grad_norm": 3.25, + "learning_rate": 0.013770655549007353, + "loss": 3.3933, + "mean_token_accuracy": 0.38142985105514526, + "num_tokens": 3954760269.0, + "step": 7737 + }, + { + "epoch": 2.0924824229313144, + "grad_norm": 3.125, + "learning_rate": 0.013769155579967886, + "loss": 3.2979, + "mean_token_accuracy": 0.3861130177974701, + "num_tokens": 3955242977.0, + "step": 7738 + }, + { + "epoch": 2.092752839372634, + "grad_norm": 2.734375, + "learning_rate": 0.013767655525962586, + "loss": 3.2221, + "mean_token_accuracy": 0.39457666873931885, + "num_tokens": 3955767227.0, + "step": 7739 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 3.078125, + "learning_rate": 0.013766155387037474, + "loss": 3.248, + "mean_token_accuracy": 0.3902932405471802, + "num_tokens": 3956291502.0, + "step": 7740 + }, + { + "epoch": 2.0932936722552733, + "grad_norm": 2.5, + "learning_rate": 0.013764655163238585, + "loss": 3.0459, + "mean_token_accuracy": 0.39065882563591003, + "num_tokens": 3956815684.0, + "step": 7741 + }, + { + "epoch": 2.093564088696593, + "grad_norm": 2.984375, + "learning_rate": 0.013763154854611938, + "loss": 3.2384, + "mean_token_accuracy": 0.37891632318496704, + "num_tokens": 3957339787.0, + "step": 7742 + }, + { + "epoch": 2.0938345051379126, + "grad_norm": 2.625, + "learning_rate": 0.013761654461203577, + "loss": 3.0629, + "mean_token_accuracy": 0.39926761388778687, + "num_tokens": 3957812205.0, + "step": 7743 + }, + { + "epoch": 2.0941049215792322, + "grad_norm": 2.59375, + "learning_rate": 0.013760153983059537, + "loss": 3.0768, + "mean_token_accuracy": 0.3939107358455658, + "num_tokens": 3958336401.0, + "step": 7744 + }, + { + "epoch": 2.0943753380205514, + "grad_norm": 2.5, + "learning_rate": 0.013758653420225853, + "loss": 3.1401, + "mean_token_accuracy": 0.4221588671207428, + "num_tokens": 3958806089.0, + "step": 7745 + }, + { + "epoch": 2.094645754461871, + "grad_norm": 2.5, + "learning_rate": 0.013757152772748568, + "loss": 3.1381, + "mean_token_accuracy": 0.3978700041770935, + "num_tokens": 3959330313.0, + "step": 7746 + }, + { + "epoch": 2.0949161709031907, + "grad_norm": 2.375, + "learning_rate": 0.013755652040673734, + "loss": 2.8944, + "mean_token_accuracy": 0.4057120084762573, + "num_tokens": 3959836425.0, + "step": 7747 + }, + { + "epoch": 2.0951865873445104, + "grad_norm": 2.734375, + "learning_rate": 0.01375415122404739, + "loss": 3.2717, + "mean_token_accuracy": 0.3734225034713745, + "num_tokens": 3960360597.0, + "step": 7748 + }, + { + "epoch": 2.09545700378583, + "grad_norm": 3.234375, + "learning_rate": 0.013752650322915583, + "loss": 3.142, + "mean_token_accuracy": 0.3915875256061554, + "num_tokens": 3960884788.0, + "step": 7749 + }, + { + "epoch": 2.0957274202271496, + "grad_norm": 2.828125, + "learning_rate": 0.01375114933732437, + "loss": 3.0701, + "mean_token_accuracy": 0.39757663011550903, + "num_tokens": 3961409030.0, + "step": 7750 + }, + { + "epoch": 2.0959978366684693, + "grad_norm": 20.25, + "learning_rate": 0.01374964826731981, + "loss": 13.1251, + "mean_token_accuracy": 0.00022485533554572612, + "num_tokens": 3961933216.0, + "step": 7751 + }, + { + "epoch": 2.096268253109789, + "grad_norm": 8.375, + "learning_rate": 0.013748147112947948, + "loss": 3.8015, + "mean_token_accuracy": 0.32118624448776245, + "num_tokens": 3962441264.0, + "step": 7752 + }, + { + "epoch": 2.0965386695511086, + "grad_norm": 29.25, + "learning_rate": 0.013746645874254853, + "loss": 3.404, + "mean_token_accuracy": 0.3746229410171509, + "num_tokens": 3962919442.0, + "step": 7753 + }, + { + "epoch": 2.096809085992428, + "grad_norm": 3.40625, + "learning_rate": 0.013745144551286587, + "loss": 3.375, + "mean_token_accuracy": 0.3667786717414856, + "num_tokens": 3963443553.0, + "step": 7754 + }, + { + "epoch": 2.097079502433748, + "grad_norm": 2.234375, + "learning_rate": 0.013743643144089211, + "loss": 3.4749, + "mean_token_accuracy": 0.364795058965683, + "num_tokens": 3963950867.0, + "step": 7755 + }, + { + "epoch": 2.0973499188750675, + "grad_norm": 3.0625, + "learning_rate": 0.013742141652708798, + "loss": 3.219, + "mean_token_accuracy": 0.38536888360977173, + "num_tokens": 3964459470.0, + "step": 7756 + }, + { + "epoch": 2.097620335316387, + "grad_norm": 2.34375, + "learning_rate": 0.013740640077191413, + "loss": 3.1667, + "mean_token_accuracy": 0.37922102212905884, + "num_tokens": 3964983729.0, + "step": 7757 + }, + { + "epoch": 2.097890751757707, + "grad_norm": 2.96875, + "learning_rate": 0.013739138417583131, + "loss": 3.2425, + "mean_token_accuracy": 0.38744837045669556, + "num_tokens": 3965507970.0, + "step": 7758 + }, + { + "epoch": 2.0981611681990264, + "grad_norm": 3.375, + "learning_rate": 0.013737636673930026, + "loss": 3.3326, + "mean_token_accuracy": 0.40145784616470337, + "num_tokens": 3966032236.0, + "step": 7759 + }, + { + "epoch": 2.098431584640346, + "grad_norm": 3.578125, + "learning_rate": 0.013736134846278178, + "loss": 3.1407, + "mean_token_accuracy": 0.42731034755706787, + "num_tokens": 3966509713.0, + "step": 7760 + }, + { + "epoch": 2.0987020010816657, + "grad_norm": 2.828125, + "learning_rate": 0.013734632934673665, + "loss": 3.3249, + "mean_token_accuracy": 0.3826959431171417, + "num_tokens": 3967033929.0, + "step": 7761 + }, + { + "epoch": 2.0989724175229854, + "grad_norm": 3.078125, + "learning_rate": 0.013733130939162572, + "loss": 3.0846, + "mean_token_accuracy": 0.40610772371292114, + "num_tokens": 3967540434.0, + "step": 7762 + }, + { + "epoch": 2.099242833964305, + "grad_norm": 3.09375, + "learning_rate": 0.013731628859790986, + "loss": 3.2526, + "mean_token_accuracy": 0.3988891839981079, + "num_tokens": 3968031770.0, + "step": 7763 + }, + { + "epoch": 2.0995132504056246, + "grad_norm": 2.9375, + "learning_rate": 0.013730126696604991, + "loss": 3.1774, + "mean_token_accuracy": 0.40516793727874756, + "num_tokens": 3968556006.0, + "step": 7764 + }, + { + "epoch": 2.0997836668469443, + "grad_norm": 3.484375, + "learning_rate": 0.013728624449650681, + "loss": 3.3587, + "mean_token_accuracy": 0.3932228088378906, + "num_tokens": 3969053968.0, + "step": 7765 + }, + { + "epoch": 2.100054083288264, + "grad_norm": 3.4375, + "learning_rate": 0.013727122118974151, + "loss": 3.3801, + "mean_token_accuracy": 0.3971472382545471, + "num_tokens": 3969570765.0, + "step": 7766 + }, + { + "epoch": 2.1003244997295836, + "grad_norm": 2.6875, + "learning_rate": 0.013725619704621491, + "loss": 3.2866, + "mean_token_accuracy": 0.3906896114349365, + "num_tokens": 3970094900.0, + "step": 7767 + }, + { + "epoch": 2.100594916170903, + "grad_norm": 3.09375, + "learning_rate": 0.013724117206638804, + "loss": 3.2968, + "mean_token_accuracy": 0.3990752100944519, + "num_tokens": 3970548396.0, + "step": 7768 + }, + { + "epoch": 2.100865332612223, + "grad_norm": 2.96875, + "learning_rate": 0.01372261462507219, + "loss": 3.3556, + "mean_token_accuracy": 0.4050595164299011, + "num_tokens": 3971019761.0, + "step": 7769 + }, + { + "epoch": 2.1011357490535425, + "grad_norm": 3.734375, + "learning_rate": 0.013721111959967756, + "loss": 3.3238, + "mean_token_accuracy": 0.3715773820877075, + "num_tokens": 3971544045.0, + "step": 7770 + }, + { + "epoch": 2.101406165494862, + "grad_norm": 40.25, + "learning_rate": 0.0137196092113716, + "loss": 17.2944, + "mean_token_accuracy": 0.006691320799291134, + "num_tokens": 3972068200.0, + "step": 7771 + }, + { + "epoch": 2.101676581936182, + "grad_norm": 10.125, + "learning_rate": 0.013718106379329837, + "loss": 3.4551, + "mean_token_accuracy": 0.3838617205619812, + "num_tokens": 3972592468.0, + "step": 7772 + }, + { + "epoch": 2.1019469983775014, + "grad_norm": 2.0625, + "learning_rate": 0.013716603463888578, + "loss": 3.2124, + "mean_token_accuracy": 0.39558184146881104, + "num_tokens": 3973077687.0, + "step": 7773 + }, + { + "epoch": 2.102217414818821, + "grad_norm": 2.296875, + "learning_rate": 0.013715100465093933, + "loss": 3.2162, + "mean_token_accuracy": 0.391175776720047, + "num_tokens": 3973591270.0, + "step": 7774 + }, + { + "epoch": 2.1024878312601407, + "grad_norm": 3.078125, + "learning_rate": 0.013713597382992023, + "loss": 3.1742, + "mean_token_accuracy": 0.3975762724876404, + "num_tokens": 3974115490.0, + "step": 7775 + }, + { + "epoch": 2.1027582477014604, + "grad_norm": 3.203125, + "learning_rate": 0.013712094217628965, + "loss": 3.0717, + "mean_token_accuracy": 0.39404749870300293, + "num_tokens": 3974639752.0, + "step": 7776 + }, + { + "epoch": 2.10302866414278, + "grad_norm": 2.875, + "learning_rate": 0.013710590969050885, + "loss": 3.1419, + "mean_token_accuracy": 0.39339059591293335, + "num_tokens": 3975163988.0, + "step": 7777 + }, + { + "epoch": 2.1032990805840996, + "grad_norm": 3.390625, + "learning_rate": 0.013709087637303897, + "loss": 3.1657, + "mean_token_accuracy": 0.3961316645145416, + "num_tokens": 3975688206.0, + "step": 7778 + }, + { + "epoch": 2.1035694970254193, + "grad_norm": 2.640625, + "learning_rate": 0.013707584222434135, + "loss": 3.1562, + "mean_token_accuracy": 0.3855706751346588, + "num_tokens": 3976159923.0, + "step": 7779 + }, + { + "epoch": 2.103839913466739, + "grad_norm": 2.828125, + "learning_rate": 0.013706080724487724, + "loss": 3.2619, + "mean_token_accuracy": 0.39301055669784546, + "num_tokens": 3976666824.0, + "step": 7780 + }, + { + "epoch": 2.1041103299080586, + "grad_norm": 2.53125, + "learning_rate": 0.013704577143510801, + "loss": 2.9774, + "mean_token_accuracy": 0.3957015872001648, + "num_tokens": 3977191083.0, + "step": 7781 + }, + { + "epoch": 2.104380746349378, + "grad_norm": 2.375, + "learning_rate": 0.013703073479549497, + "loss": 3.2051, + "mean_token_accuracy": 0.3970402479171753, + "num_tokens": 3977715290.0, + "step": 7782 + }, + { + "epoch": 2.104651162790698, + "grad_norm": 3.671875, + "learning_rate": 0.01370156973264995, + "loss": 3.2397, + "mean_token_accuracy": 0.3871806859970093, + "num_tokens": 3978239340.0, + "step": 7783 + }, + { + "epoch": 2.1049215792320175, + "grad_norm": 2.984375, + "learning_rate": 0.013700065902858301, + "loss": 3.1626, + "mean_token_accuracy": 0.3974176347255707, + "num_tokens": 3978763621.0, + "step": 7784 + }, + { + "epoch": 2.105191995673337, + "grad_norm": 3.515625, + "learning_rate": 0.013698561990220686, + "loss": 3.3598, + "mean_token_accuracy": 0.37718361616134644, + "num_tokens": 3979287897.0, + "step": 7785 + }, + { + "epoch": 2.1054624121146563, + "grad_norm": 3.578125, + "learning_rate": 0.013697057994783254, + "loss": 3.3365, + "mean_token_accuracy": 0.3795454502105713, + "num_tokens": 3979812170.0, + "step": 7786 + }, + { + "epoch": 2.105732828555976, + "grad_norm": 3.296875, + "learning_rate": 0.013695553916592154, + "loss": 3.1286, + "mean_token_accuracy": 0.4014163315296173, + "num_tokens": 3980313153.0, + "step": 7787 + }, + { + "epoch": 2.1060032449972956, + "grad_norm": 2.96875, + "learning_rate": 0.01369404975569353, + "loss": 3.1353, + "mean_token_accuracy": 0.4119042158126831, + "num_tokens": 3980837414.0, + "step": 7788 + }, + { + "epoch": 2.1062736614386153, + "grad_norm": 3.640625, + "learning_rate": 0.013692545512133536, + "loss": 3.2202, + "mean_token_accuracy": 0.391124427318573, + "num_tokens": 3981361389.0, + "step": 7789 + }, + { + "epoch": 2.106544077879935, + "grad_norm": 3.03125, + "learning_rate": 0.013691041185958328, + "loss": 3.1121, + "mean_token_accuracy": 0.41699469089508057, + "num_tokens": 3981885580.0, + "step": 7790 + }, + { + "epoch": 2.1068144943212546, + "grad_norm": 149.0, + "learning_rate": 0.013689536777214064, + "loss": 19.1014, + "mean_token_accuracy": 0.0028376085683703423, + "num_tokens": 3982409861.0, + "step": 7791 + }, + { + "epoch": 2.107084910762574, + "grad_norm": 6.71875, + "learning_rate": 0.0136880322859469, + "loss": 3.6521, + "mean_token_accuracy": 0.3234255313873291, + "num_tokens": 3982934124.0, + "step": 7792 + }, + { + "epoch": 2.107355327203894, + "grad_norm": 1.828125, + "learning_rate": 0.013686527712203005, + "loss": 3.0921, + "mean_token_accuracy": 0.3955020308494568, + "num_tokens": 3983458399.0, + "step": 7793 + }, + { + "epoch": 2.1076257436452135, + "grad_norm": 2.375, + "learning_rate": 0.013685023056028534, + "loss": 3.2852, + "mean_token_accuracy": 0.40287113189697266, + "num_tokens": 3983982531.0, + "step": 7794 + }, + { + "epoch": 2.107896160086533, + "grad_norm": 2.40625, + "learning_rate": 0.01368351831746966, + "loss": 2.9622, + "mean_token_accuracy": 0.3702574372291565, + "num_tokens": 3984506671.0, + "step": 7795 + }, + { + "epoch": 2.1081665765278528, + "grad_norm": 2.15625, + "learning_rate": 0.013682013496572554, + "loss": 3.1307, + "mean_token_accuracy": 0.403271347284317, + "num_tokens": 3985030921.0, + "step": 7796 + }, + { + "epoch": 2.1084369929691724, + "grad_norm": 2.921875, + "learning_rate": 0.013680508593383387, + "loss": 3.2211, + "mean_token_accuracy": 0.3803383708000183, + "num_tokens": 3985555187.0, + "step": 7797 + }, + { + "epoch": 2.108707409410492, + "grad_norm": 2.8125, + "learning_rate": 0.013679003607948334, + "loss": 3.2019, + "mean_token_accuracy": 0.40612736344337463, + "num_tokens": 3986079422.0, + "step": 7798 + }, + { + "epoch": 2.1089778258518117, + "grad_norm": 2.9375, + "learning_rate": 0.013677498540313571, + "loss": 3.3604, + "mean_token_accuracy": 0.368937224149704, + "num_tokens": 3986603619.0, + "step": 7799 + }, + { + "epoch": 2.1092482422931313, + "grad_norm": 2.625, + "learning_rate": 0.013675993390525281, + "loss": 3.2134, + "mean_token_accuracy": 0.3920243978500366, + "num_tokens": 3987127883.0, + "step": 7800 + }, + { + "epoch": 2.109518658734451, + "grad_norm": 2.875, + "learning_rate": 0.013674488158629641, + "loss": 3.286, + "mean_token_accuracy": 0.3848431706428528, + "num_tokens": 3987591518.0, + "step": 7801 + }, + { + "epoch": 2.1097890751757706, + "grad_norm": 2.6875, + "learning_rate": 0.01367298284467284, + "loss": 3.1538, + "mean_token_accuracy": 0.3762225806713104, + "num_tokens": 3988094419.0, + "step": 7802 + }, + { + "epoch": 2.1100594916170903, + "grad_norm": 2.9375, + "learning_rate": 0.013671477448701064, + "loss": 3.1174, + "mean_token_accuracy": 0.4298875331878662, + "num_tokens": 3988604605.0, + "step": 7803 + }, + { + "epoch": 2.11032990805841, + "grad_norm": 3.609375, + "learning_rate": 0.013669971970760504, + "loss": 3.3387, + "mean_token_accuracy": 0.3881511390209198, + "num_tokens": 3989077861.0, + "step": 7804 + }, + { + "epoch": 2.1106003244997296, + "grad_norm": 2.28125, + "learning_rate": 0.013668466410897353, + "loss": 3.0243, + "mean_token_accuracy": 0.3999728560447693, + "num_tokens": 3989528548.0, + "step": 7805 + }, + { + "epoch": 2.110870740941049, + "grad_norm": 3.140625, + "learning_rate": 0.013666960769157806, + "loss": 3.1066, + "mean_token_accuracy": 0.37742549180984497, + "num_tokens": 3990052751.0, + "step": 7806 + }, + { + "epoch": 2.111141157382369, + "grad_norm": 2.703125, + "learning_rate": 0.013665455045588058, + "loss": 3.1686, + "mean_token_accuracy": 0.42451614141464233, + "num_tokens": 3990481032.0, + "step": 7807 + }, + { + "epoch": 2.1114115738236885, + "grad_norm": 2.78125, + "learning_rate": 0.01366394924023431, + "loss": 3.3189, + "mean_token_accuracy": 0.36228740215301514, + "num_tokens": 3991005122.0, + "step": 7808 + }, + { + "epoch": 2.111681990265008, + "grad_norm": 2.984375, + "learning_rate": 0.013662443353142767, + "loss": 3.3198, + "mean_token_accuracy": 0.362156480550766, + "num_tokens": 3991529335.0, + "step": 7809 + }, + { + "epoch": 2.111952406706328, + "grad_norm": 2.546875, + "learning_rate": 0.013660937384359629, + "loss": 3.2497, + "mean_token_accuracy": 0.39341437816619873, + "num_tokens": 3992053493.0, + "step": 7810 + }, + { + "epoch": 2.1122228231476474, + "grad_norm": 32.5, + "learning_rate": 0.013659431333931106, + "loss": 11.7558, + "mean_token_accuracy": 0.01160544715821743, + "num_tokens": 3992577761.0, + "step": 7811 + }, + { + "epoch": 2.112493239588967, + "grad_norm": 38.75, + "learning_rate": 0.013657925201903412, + "loss": 3.9515, + "mean_token_accuracy": 0.2904203534126282, + "num_tokens": 3993102016.0, + "step": 7812 + }, + { + "epoch": 2.1127636560302867, + "grad_norm": 4.90625, + "learning_rate": 0.013656418988322759, + "loss": 3.8194, + "mean_token_accuracy": 0.2949511408805847, + "num_tokens": 3993626236.0, + "step": 7813 + }, + { + "epoch": 2.1130340724716064, + "grad_norm": 2.46875, + "learning_rate": 0.013654912693235354, + "loss": 3.6216, + "mean_token_accuracy": 0.3215827941894531, + "num_tokens": 3994150417.0, + "step": 7814 + }, + { + "epoch": 2.113304488912926, + "grad_norm": 2.59375, + "learning_rate": 0.013653406316687424, + "loss": 3.415, + "mean_token_accuracy": 0.3731899857521057, + "num_tokens": 3994669353.0, + "step": 7815 + }, + { + "epoch": 2.1135749053542456, + "grad_norm": 3.40625, + "learning_rate": 0.013651899858725183, + "loss": 3.3529, + "mean_token_accuracy": 0.39047589898109436, + "num_tokens": 3995144043.0, + "step": 7816 + }, + { + "epoch": 2.1138453217955653, + "grad_norm": 3.125, + "learning_rate": 0.013650393319394856, + "loss": 3.3255, + "mean_token_accuracy": 0.36028560996055603, + "num_tokens": 3995668234.0, + "step": 7817 + }, + { + "epoch": 2.114115738236885, + "grad_norm": 2.875, + "learning_rate": 0.013648886698742667, + "loss": 3.1677, + "mean_token_accuracy": 0.4041031002998352, + "num_tokens": 3996192348.0, + "step": 7818 + }, + { + "epoch": 2.1143861546782046, + "grad_norm": 3.25, + "learning_rate": 0.013647379996814844, + "loss": 2.9352, + "mean_token_accuracy": 0.44015514850616455, + "num_tokens": 3996716520.0, + "step": 7819 + }, + { + "epoch": 2.114656571119524, + "grad_norm": 3.296875, + "learning_rate": 0.01364587321365762, + "loss": 3.2842, + "mean_token_accuracy": 0.37780213356018066, + "num_tokens": 3997240696.0, + "step": 7820 + }, + { + "epoch": 2.114926987560844, + "grad_norm": 3.96875, + "learning_rate": 0.01364436634931722, + "loss": 3.0779, + "mean_token_accuracy": 0.3757663369178772, + "num_tokens": 3997764903.0, + "step": 7821 + }, + { + "epoch": 2.1151974040021635, + "grad_norm": 3.4375, + "learning_rate": 0.013642859403839889, + "loss": 3.2603, + "mean_token_accuracy": 0.3806658089160919, + "num_tokens": 3998289139.0, + "step": 7822 + }, + { + "epoch": 2.115467820443483, + "grad_norm": 3.578125, + "learning_rate": 0.013641352377271857, + "loss": 2.9591, + "mean_token_accuracy": 0.43725091218948364, + "num_tokens": 3998813313.0, + "step": 7823 + }, + { + "epoch": 2.115738236884803, + "grad_norm": 7.21875, + "learning_rate": 0.013639845269659365, + "loss": 3.2659, + "mean_token_accuracy": 0.3932531476020813, + "num_tokens": 3999337556.0, + "step": 7824 + }, + { + "epoch": 2.1160086533261224, + "grad_norm": 2.859375, + "learning_rate": 0.013638338081048657, + "loss": 3.3524, + "mean_token_accuracy": 0.3503568172454834, + "num_tokens": 3999861795.0, + "step": 7825 + }, + { + "epoch": 2.116279069767442, + "grad_norm": 3.953125, + "learning_rate": 0.01363683081148598, + "loss": 3.3088, + "mean_token_accuracy": 0.38905832171440125, + "num_tokens": 4000385978.0, + "step": 7826 + }, + { + "epoch": 2.1165494862087613, + "grad_norm": 2.890625, + "learning_rate": 0.01363532346101758, + "loss": 3.3452, + "mean_token_accuracy": 0.37732189893722534, + "num_tokens": 4000879114.0, + "step": 7827 + }, + { + "epoch": 2.1168199026500814, + "grad_norm": 3.4375, + "learning_rate": 0.013633816029689706, + "loss": 3.4505, + "mean_token_accuracy": 0.37965285778045654, + "num_tokens": 4001403327.0, + "step": 7828 + }, + { + "epoch": 2.1170903190914006, + "grad_norm": 2.296875, + "learning_rate": 0.013632308517548609, + "loss": 3.1561, + "mean_token_accuracy": 0.41043156385421753, + "num_tokens": 4001927518.0, + "step": 7829 + }, + { + "epoch": 2.11736073553272, + "grad_norm": 2.6875, + "learning_rate": 0.013630800924640543, + "loss": 3.1247, + "mean_token_accuracy": 0.3983529806137085, + "num_tokens": 4002451797.0, + "step": 7830 + }, + { + "epoch": 2.11763115197404, + "grad_norm": 35.5, + "learning_rate": 0.013629293251011771, + "loss": 15.2177, + "mean_token_accuracy": 0.0021211770363152027, + "num_tokens": 4002976018.0, + "step": 7831 + }, + { + "epoch": 2.1179015684153595, + "grad_norm": 6.5, + "learning_rate": 0.01362778549670855, + "loss": 3.5802, + "mean_token_accuracy": 0.34353721141815186, + "num_tokens": 4003500187.0, + "step": 7832 + }, + { + "epoch": 2.118171984856679, + "grad_norm": 1.8359375, + "learning_rate": 0.013626277661777143, + "loss": 3.2734, + "mean_token_accuracy": 0.3859521150588989, + "num_tokens": 4004024293.0, + "step": 7833 + }, + { + "epoch": 2.1184424012979988, + "grad_norm": 2.484375, + "learning_rate": 0.013624769746263811, + "loss": 3.2742, + "mean_token_accuracy": 0.3753325343132019, + "num_tokens": 4004548470.0, + "step": 7834 + }, + { + "epoch": 2.1187128177393184, + "grad_norm": 3.171875, + "learning_rate": 0.013623261750214825, + "loss": 3.1697, + "mean_token_accuracy": 0.38317322731018066, + "num_tokens": 4005072721.0, + "step": 7835 + }, + { + "epoch": 2.118983234180638, + "grad_norm": 2.578125, + "learning_rate": 0.013621753673676455, + "loss": 3.1813, + "mean_token_accuracy": 0.4068715572357178, + "num_tokens": 4005495578.0, + "step": 7836 + }, + { + "epoch": 2.1192536506219577, + "grad_norm": 3.265625, + "learning_rate": 0.013620245516694973, + "loss": 3.2887, + "mean_token_accuracy": 0.39360737800598145, + "num_tokens": 4006019830.0, + "step": 7837 + }, + { + "epoch": 2.1195240670632773, + "grad_norm": 4.25, + "learning_rate": 0.01361873727931665, + "loss": 3.3491, + "mean_token_accuracy": 0.38916563987731934, + "num_tokens": 4006544096.0, + "step": 7838 + }, + { + "epoch": 2.119794483504597, + "grad_norm": 3.0625, + "learning_rate": 0.013617228961587772, + "loss": 3.3576, + "mean_token_accuracy": 0.3851073980331421, + "num_tokens": 4007068370.0, + "step": 7839 + }, + { + "epoch": 2.1200648999459166, + "grad_norm": 3.59375, + "learning_rate": 0.013615720563554607, + "loss": 3.0588, + "mean_token_accuracy": 0.40576040744781494, + "num_tokens": 4007570983.0, + "step": 7840 + }, + { + "epoch": 2.1203353163872363, + "grad_norm": 2.703125, + "learning_rate": 0.013614212085263444, + "loss": 3.1248, + "mean_token_accuracy": 0.4066237211227417, + "num_tokens": 4008095028.0, + "step": 7841 + }, + { + "epoch": 2.120605732828556, + "grad_norm": 2.46875, + "learning_rate": 0.01361270352676057, + "loss": 2.9522, + "mean_token_accuracy": 0.4056743383407593, + "num_tokens": 4008619286.0, + "step": 7842 + }, + { + "epoch": 2.1208761492698756, + "grad_norm": 3.234375, + "learning_rate": 0.013611194888092265, + "loss": 3.4754, + "mean_token_accuracy": 0.37024950981140137, + "num_tokens": 4009143512.0, + "step": 7843 + }, + { + "epoch": 2.121146565711195, + "grad_norm": 2.859375, + "learning_rate": 0.013609686169304823, + "loss": 3.1474, + "mean_token_accuracy": 0.376888632774353, + "num_tokens": 4009644623.0, + "step": 7844 + }, + { + "epoch": 2.121416982152515, + "grad_norm": 2.46875, + "learning_rate": 0.01360817737044453, + "loss": 3.2859, + "mean_token_accuracy": 0.3872203826904297, + "num_tokens": 4010168825.0, + "step": 7845 + }, + { + "epoch": 2.1216873985938345, + "grad_norm": 3.0625, + "learning_rate": 0.01360666849155769, + "loss": 3.0433, + "mean_token_accuracy": 0.38931411504745483, + "num_tokens": 4010693101.0, + "step": 7846 + }, + { + "epoch": 2.121957815035154, + "grad_norm": 2.53125, + "learning_rate": 0.013605159532690593, + "loss": 2.9617, + "mean_token_accuracy": 0.4211633801460266, + "num_tokens": 4011174955.0, + "step": 7847 + }, + { + "epoch": 2.1222282314764738, + "grad_norm": 2.640625, + "learning_rate": 0.01360365049388954, + "loss": 3.0471, + "mean_token_accuracy": 0.40956658124923706, + "num_tokens": 4011699140.0, + "step": 7848 + }, + { + "epoch": 2.1224986479177934, + "grad_norm": 2.546875, + "learning_rate": 0.013602141375200836, + "loss": 3.1125, + "mean_token_accuracy": 0.39678096771240234, + "num_tokens": 4012223337.0, + "step": 7849 + }, + { + "epoch": 2.122769064359113, + "grad_norm": 3.3125, + "learning_rate": 0.01360063217667078, + "loss": 3.2457, + "mean_token_accuracy": 0.3883545398712158, + "num_tokens": 4012747579.0, + "step": 7850 + }, + { + "epoch": 2.1230394808004327, + "grad_norm": 504.0, + "learning_rate": 0.01359912289834568, + "loss": 13.3798, + "mean_token_accuracy": 0.01133904978632927, + "num_tokens": 4013271681.0, + "step": 7851 + }, + { + "epoch": 2.1233098972417523, + "grad_norm": 7.59375, + "learning_rate": 0.013597613540271848, + "loss": 3.7459, + "mean_token_accuracy": 0.35602158308029175, + "num_tokens": 4013795860.0, + "step": 7852 + }, + { + "epoch": 2.123580313683072, + "grad_norm": 2.4375, + "learning_rate": 0.013596104102495593, + "loss": 3.4541, + "mean_token_accuracy": 0.3739112615585327, + "num_tokens": 4014320137.0, + "step": 7853 + }, + { + "epoch": 2.1238507301243916, + "grad_norm": 2.9375, + "learning_rate": 0.013594594585063229, + "loss": 3.3414, + "mean_token_accuracy": 0.3776279091835022, + "num_tokens": 4014844394.0, + "step": 7854 + }, + { + "epoch": 2.1241211465657113, + "grad_norm": 2.59375, + "learning_rate": 0.013593084988021071, + "loss": 3.2047, + "mean_token_accuracy": 0.394683301448822, + "num_tokens": 4015368566.0, + "step": 7855 + }, + { + "epoch": 2.124391563007031, + "grad_norm": 2.65625, + "learning_rate": 0.013591575311415444, + "loss": 3.4524, + "mean_token_accuracy": 0.36516958475112915, + "num_tokens": 4015892849.0, + "step": 7856 + }, + { + "epoch": 2.1246619794483506, + "grad_norm": 3.109375, + "learning_rate": 0.013590065555292663, + "loss": 3.4089, + "mean_token_accuracy": 0.3574386239051819, + "num_tokens": 4016417033.0, + "step": 7857 + }, + { + "epoch": 2.12493239588967, + "grad_norm": 3.40625, + "learning_rate": 0.013588555719699054, + "loss": 3.1957, + "mean_token_accuracy": 0.41845184564590454, + "num_tokens": 4016842060.0, + "step": 7858 + }, + { + "epoch": 2.12520281233099, + "grad_norm": 3.34375, + "learning_rate": 0.013587045804680942, + "loss": 3.3625, + "mean_token_accuracy": 0.3574168086051941, + "num_tokens": 4017366315.0, + "step": 7859 + }, + { + "epoch": 2.1254732287723095, + "grad_norm": 2.53125, + "learning_rate": 0.013585535810284656, + "loss": 3.1494, + "mean_token_accuracy": 0.40497350692749023, + "num_tokens": 4017842859.0, + "step": 7860 + }, + { + "epoch": 2.125743645213629, + "grad_norm": 2.921875, + "learning_rate": 0.013584025736556529, + "loss": 3.2373, + "mean_token_accuracy": 0.37427300214767456, + "num_tokens": 4018346255.0, + "step": 7861 + }, + { + "epoch": 2.1260140616549488, + "grad_norm": 2.234375, + "learning_rate": 0.013582515583542896, + "loss": 3.2982, + "mean_token_accuracy": 0.40009427070617676, + "num_tokens": 4018870482.0, + "step": 7862 + }, + { + "epoch": 2.1262844780962684, + "grad_norm": 2.28125, + "learning_rate": 0.01358100535129009, + "loss": 3.1815, + "mean_token_accuracy": 0.3986833095550537, + "num_tokens": 4019394661.0, + "step": 7863 + }, + { + "epoch": 2.126554894537588, + "grad_norm": 2.625, + "learning_rate": 0.013579495039844448, + "loss": 3.1595, + "mean_token_accuracy": 0.3811435103416443, + "num_tokens": 4019918806.0, + "step": 7864 + }, + { + "epoch": 2.1268253109789077, + "grad_norm": 2.0625, + "learning_rate": 0.013577984649252311, + "loss": 3.0598, + "mean_token_accuracy": 0.4089723825454712, + "num_tokens": 4020443010.0, + "step": 7865 + }, + { + "epoch": 2.1270957274202273, + "grad_norm": 2.90625, + "learning_rate": 0.013576474179560023, + "loss": 2.9961, + "mean_token_accuracy": 0.4018062651157379, + "num_tokens": 4020967128.0, + "step": 7866 + }, + { + "epoch": 2.127366143861547, + "grad_norm": 2.6875, + "learning_rate": 0.01357496363081393, + "loss": 3.2808, + "mean_token_accuracy": 0.38109248876571655, + "num_tokens": 4021491298.0, + "step": 7867 + }, + { + "epoch": 2.127636560302866, + "grad_norm": 2.84375, + "learning_rate": 0.013573453003060381, + "loss": 3.2428, + "mean_token_accuracy": 0.3910449743270874, + "num_tokens": 4021978324.0, + "step": 7868 + }, + { + "epoch": 2.1279069767441863, + "grad_norm": 2.359375, + "learning_rate": 0.01357194229634573, + "loss": 3.2268, + "mean_token_accuracy": 0.407157838344574, + "num_tokens": 4022483741.0, + "step": 7869 + }, + { + "epoch": 2.1281773931855055, + "grad_norm": 3.109375, + "learning_rate": 0.01357043151071632, + "loss": 3.1189, + "mean_token_accuracy": 0.4082088768482208, + "num_tokens": 4022931736.0, + "step": 7870 + }, + { + "epoch": 2.128447809626825, + "grad_norm": 79.5, + "learning_rate": 0.013568920646218516, + "loss": 10.572, + "mean_token_accuracy": 0.012432435527443886, + "num_tokens": 4023456013.0, + "step": 7871 + }, + { + "epoch": 2.1287182260681448, + "grad_norm": 6.375, + "learning_rate": 0.013567409702898674, + "loss": 3.5564, + "mean_token_accuracy": 0.37447652220726013, + "num_tokens": 4023980259.0, + "step": 7872 + }, + { + "epoch": 2.1289886425094644, + "grad_norm": 1.6171875, + "learning_rate": 0.013565898680803146, + "loss": 3.2155, + "mean_token_accuracy": 0.3942318260669708, + "num_tokens": 4024444686.0, + "step": 7873 + }, + { + "epoch": 2.129259058950784, + "grad_norm": 2.71875, + "learning_rate": 0.013564387579978304, + "loss": 3.1913, + "mean_token_accuracy": 0.4010809361934662, + "num_tokens": 4024968795.0, + "step": 7874 + }, + { + "epoch": 2.1295294753921037, + "grad_norm": 3.71875, + "learning_rate": 0.013562876400470509, + "loss": 3.2468, + "mean_token_accuracy": 0.4001780152320862, + "num_tokens": 4025492973.0, + "step": 7875 + }, + { + "epoch": 2.1297998918334233, + "grad_norm": 3.5, + "learning_rate": 0.013561365142326127, + "loss": 3.2851, + "mean_token_accuracy": 0.38207921385765076, + "num_tokens": 4026017137.0, + "step": 7876 + }, + { + "epoch": 2.130070308274743, + "grad_norm": 5.34375, + "learning_rate": 0.013559853805591533, + "loss": 3.4271, + "mean_token_accuracy": 0.35706228017807007, + "num_tokens": 4026541314.0, + "step": 7877 + }, + { + "epoch": 2.1303407247160626, + "grad_norm": 2.625, + "learning_rate": 0.013558342390313094, + "loss": 3.0851, + "mean_token_accuracy": 0.4126955270767212, + "num_tokens": 4027040430.0, + "step": 7878 + }, + { + "epoch": 2.1306111411573823, + "grad_norm": 3.5, + "learning_rate": 0.01355683089653719, + "loss": 3.0036, + "mean_token_accuracy": 0.3905375599861145, + "num_tokens": 4027564662.0, + "step": 7879 + }, + { + "epoch": 2.130881557598702, + "grad_norm": 2.59375, + "learning_rate": 0.013555319324310191, + "loss": 3.0879, + "mean_token_accuracy": 0.4109759032726288, + "num_tokens": 4028047349.0, + "step": 7880 + }, + { + "epoch": 2.1311519740400215, + "grad_norm": 3.765625, + "learning_rate": 0.013553807673678482, + "loss": 3.3801, + "mean_token_accuracy": 0.38748592138290405, + "num_tokens": 4028498486.0, + "step": 7881 + }, + { + "epoch": 2.131422390481341, + "grad_norm": 2.515625, + "learning_rate": 0.013552295944688448, + "loss": 3.0156, + "mean_token_accuracy": 0.39980947971343994, + "num_tokens": 4029022677.0, + "step": 7882 + }, + { + "epoch": 2.131692806922661, + "grad_norm": 3.578125, + "learning_rate": 0.013550784137386463, + "loss": 3.08, + "mean_token_accuracy": 0.39189743995666504, + "num_tokens": 4029546803.0, + "step": 7883 + }, + { + "epoch": 2.1319632233639805, + "grad_norm": 2.578125, + "learning_rate": 0.013549272251818921, + "loss": 3.0712, + "mean_token_accuracy": 0.4124358892440796, + "num_tokens": 4030015340.0, + "step": 7884 + }, + { + "epoch": 2.1322336398053, + "grad_norm": 3.015625, + "learning_rate": 0.013547760288032212, + "loss": 3.0596, + "mean_token_accuracy": 0.4004517197608948, + "num_tokens": 4030476604.0, + "step": 7885 + }, + { + "epoch": 2.1325040562466198, + "grad_norm": 3.03125, + "learning_rate": 0.013546248246072718, + "loss": 3.1713, + "mean_token_accuracy": 0.4011874198913574, + "num_tokens": 4031000882.0, + "step": 7886 + }, + { + "epoch": 2.1327744726879394, + "grad_norm": 3.296875, + "learning_rate": 0.013544736125986846, + "loss": 3.2443, + "mean_token_accuracy": 0.38092952966690063, + "num_tokens": 4031525130.0, + "step": 7887 + }, + { + "epoch": 2.133044889129259, + "grad_norm": 3.21875, + "learning_rate": 0.013543223927820982, + "loss": 3.3125, + "mean_token_accuracy": 0.3806946873664856, + "num_tokens": 4032049250.0, + "step": 7888 + }, + { + "epoch": 2.1333153055705787, + "grad_norm": 5.90625, + "learning_rate": 0.01354171165162153, + "loss": 2.9217, + "mean_token_accuracy": 0.43488964438438416, + "num_tokens": 4032573400.0, + "step": 7889 + }, + { + "epoch": 2.1335857220118983, + "grad_norm": 1.9609375, + "learning_rate": 0.01354019929743489, + "loss": 3.2293, + "mean_token_accuracy": 0.3824729919433594, + "num_tokens": 4033097678.0, + "step": 7890 + }, + { + "epoch": 2.133856138453218, + "grad_norm": 0.73828125, + "learning_rate": 0.013538686865307467, + "loss": 11.1056, + "mean_token_accuracy": 1.2713961041299626e-05, + "num_tokens": 4033621839.0, + "step": 7891 + }, + { + "epoch": 2.1341265548945376, + "grad_norm": 11.0625, + "learning_rate": 0.013537174355285665, + "loss": 3.8407, + "mean_token_accuracy": 0.35675978660583496, + "num_tokens": 4034078568.0, + "step": 7892 + }, + { + "epoch": 2.1343969713358573, + "grad_norm": 3.3125, + "learning_rate": 0.01353566176741589, + "loss": 3.5739, + "mean_token_accuracy": 0.38325777649879456, + "num_tokens": 4034602844.0, + "step": 7893 + }, + { + "epoch": 2.134667387777177, + "grad_norm": 3.078125, + "learning_rate": 0.013534149101744558, + "loss": 3.3836, + "mean_token_accuracy": 0.37506216764450073, + "num_tokens": 4035092524.0, + "step": 7894 + }, + { + "epoch": 2.1349378042184965, + "grad_norm": 3.59375, + "learning_rate": 0.013532636358318077, + "loss": 3.2631, + "mean_token_accuracy": 0.39837414026260376, + "num_tokens": 4035559659.0, + "step": 7895 + }, + { + "epoch": 2.135208220659816, + "grad_norm": 3.78125, + "learning_rate": 0.013531123537182863, + "loss": 3.3376, + "mean_token_accuracy": 0.3752535581588745, + "num_tokens": 4036083901.0, + "step": 7896 + }, + { + "epoch": 2.135478637101136, + "grad_norm": 3.015625, + "learning_rate": 0.013529610638385339, + "loss": 3.1944, + "mean_token_accuracy": 0.3917458951473236, + "num_tokens": 4036608175.0, + "step": 7897 + }, + { + "epoch": 2.1357490535424555, + "grad_norm": 3.671875, + "learning_rate": 0.013528097661971921, + "loss": 3.1714, + "mean_token_accuracy": 0.37573251128196716, + "num_tokens": 4037132419.0, + "step": 7898 + }, + { + "epoch": 2.136019469983775, + "grad_norm": 2.65625, + "learning_rate": 0.01352658460798903, + "loss": 3.2293, + "mean_token_accuracy": 0.3738613724708557, + "num_tokens": 4037656632.0, + "step": 7899 + }, + { + "epoch": 2.1362898864250948, + "grad_norm": 2.625, + "learning_rate": 0.013525071476483092, + "loss": 3.1915, + "mean_token_accuracy": 0.3808107078075409, + "num_tokens": 4038180648.0, + "step": 7900 + }, + { + "epoch": 2.1365603028664144, + "grad_norm": 3.90625, + "learning_rate": 0.01352355826750054, + "loss": 3.2868, + "mean_token_accuracy": 0.3832472860813141, + "num_tokens": 4038704421.0, + "step": 7901 + }, + { + "epoch": 2.136830719307734, + "grad_norm": 3.40625, + "learning_rate": 0.013522044981087795, + "loss": 3.4717, + "mean_token_accuracy": 0.39222538471221924, + "num_tokens": 4039180296.0, + "step": 7902 + }, + { + "epoch": 2.1371011357490537, + "grad_norm": 2.734375, + "learning_rate": 0.013520531617291293, + "loss": 3.1556, + "mean_token_accuracy": 0.3784233033657074, + "num_tokens": 4039704562.0, + "step": 7903 + }, + { + "epoch": 2.1373715521903733, + "grad_norm": 4.90625, + "learning_rate": 0.013519018176157471, + "loss": 3.2624, + "mean_token_accuracy": 0.3775830864906311, + "num_tokens": 4040171122.0, + "step": 7904 + }, + { + "epoch": 2.137641968631693, + "grad_norm": 2.015625, + "learning_rate": 0.013517504657732767, + "loss": 3.235, + "mean_token_accuracy": 0.41072648763656616, + "num_tokens": 4040662261.0, + "step": 7905 + }, + { + "epoch": 2.1379123850730126, + "grad_norm": 2.3125, + "learning_rate": 0.01351599106206361, + "loss": 3.0221, + "mean_token_accuracy": 0.401347279548645, + "num_tokens": 4041186347.0, + "step": 7906 + }, + { + "epoch": 2.1381828015143323, + "grad_norm": 2.5, + "learning_rate": 0.01351447738919645, + "loss": 3.07, + "mean_token_accuracy": 0.4027513563632965, + "num_tokens": 4041710605.0, + "step": 7907 + }, + { + "epoch": 2.138453217955652, + "grad_norm": 2.640625, + "learning_rate": 0.013512963639177727, + "loss": 3.3248, + "mean_token_accuracy": 0.38222599029541016, + "num_tokens": 4042234767.0, + "step": 7908 + }, + { + "epoch": 2.138723634396971, + "grad_norm": 3.40625, + "learning_rate": 0.013511449812053893, + "loss": 3.3053, + "mean_token_accuracy": 0.38731417059898376, + "num_tokens": 4042758950.0, + "step": 7909 + }, + { + "epoch": 2.138994050838291, + "grad_norm": 3.234375, + "learning_rate": 0.013509935907871393, + "loss": 3.2184, + "mean_token_accuracy": 0.39664125442504883, + "num_tokens": 4043223774.0, + "step": 7910 + }, + { + "epoch": 2.1392644672796104, + "grad_norm": 195.0, + "learning_rate": 0.013508421926676678, + "loss": 12.8369, + "mean_token_accuracy": 0.0003816456301137805, + "num_tokens": 4043748038.0, + "step": 7911 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 6.5, + "learning_rate": 0.013506907868516198, + "loss": 3.7655, + "mean_token_accuracy": 0.33493977785110474, + "num_tokens": 4044272211.0, + "step": 7912 + }, + { + "epoch": 2.1398053001622497, + "grad_norm": 3.28125, + "learning_rate": 0.013505393733436417, + "loss": 3.1993, + "mean_token_accuracy": 0.39757785201072693, + "num_tokens": 4044796306.0, + "step": 7913 + }, + { + "epoch": 2.1400757166035693, + "grad_norm": 3.734375, + "learning_rate": 0.013503879521483787, + "loss": 3.2063, + "mean_token_accuracy": 0.38160014152526855, + "num_tokens": 4045320491.0, + "step": 7914 + }, + { + "epoch": 2.140346133044889, + "grad_norm": 3.1875, + "learning_rate": 0.013502365232704766, + "loss": 3.2793, + "mean_token_accuracy": 0.35957568883895874, + "num_tokens": 4045844672.0, + "step": 7915 + }, + { + "epoch": 2.1406165494862086, + "grad_norm": 2.75, + "learning_rate": 0.013500850867145824, + "loss": 3.0736, + "mean_token_accuracy": 0.4022439122200012, + "num_tokens": 4046368921.0, + "step": 7916 + }, + { + "epoch": 2.1408869659275283, + "grad_norm": 3.046875, + "learning_rate": 0.01349933642485342, + "loss": 3.3523, + "mean_token_accuracy": 0.33835068345069885, + "num_tokens": 4046846198.0, + "step": 7917 + }, + { + "epoch": 2.141157382368848, + "grad_norm": 2.421875, + "learning_rate": 0.013497821905874032, + "loss": 2.9793, + "mean_token_accuracy": 0.40753495693206787, + "num_tokens": 4047370482.0, + "step": 7918 + }, + { + "epoch": 2.1414277988101675, + "grad_norm": 2.703125, + "learning_rate": 0.013496307310254115, + "loss": 3.025, + "mean_token_accuracy": 0.3886718153953552, + "num_tokens": 4047894648.0, + "step": 7919 + }, + { + "epoch": 2.141698215251487, + "grad_norm": 2.328125, + "learning_rate": 0.013494792638040152, + "loss": 3.0993, + "mean_token_accuracy": 0.39086347818374634, + "num_tokens": 4048418876.0, + "step": 7920 + }, + { + "epoch": 2.141968631692807, + "grad_norm": 2.453125, + "learning_rate": 0.013493277889278614, + "loss": 2.9963, + "mean_token_accuracy": 0.40875738859176636, + "num_tokens": 4048943108.0, + "step": 7921 + }, + { + "epoch": 2.1422390481341265, + "grad_norm": 2.640625, + "learning_rate": 0.013491763064015977, + "loss": 3.1783, + "mean_token_accuracy": 0.39351144433021545, + "num_tokens": 4049443746.0, + "step": 7922 + }, + { + "epoch": 2.142509464575446, + "grad_norm": 3.296875, + "learning_rate": 0.01349024816229872, + "loss": 3.3246, + "mean_token_accuracy": 0.36130136251449585, + "num_tokens": 4049967958.0, + "step": 7923 + }, + { + "epoch": 2.1427798810167658, + "grad_norm": 3.015625, + "learning_rate": 0.013488733184173327, + "loss": 3.316, + "mean_token_accuracy": 0.3823903203010559, + "num_tokens": 4050492214.0, + "step": 7924 + }, + { + "epoch": 2.1430502974580854, + "grad_norm": 3.09375, + "learning_rate": 0.013487218129686283, + "loss": 3.1193, + "mean_token_accuracy": 0.4291916489601135, + "num_tokens": 4050952641.0, + "step": 7925 + }, + { + "epoch": 2.143320713899405, + "grad_norm": 2.28125, + "learning_rate": 0.013485702998884071, + "loss": 3.2906, + "mean_token_accuracy": 0.3910057544708252, + "num_tokens": 4051476891.0, + "step": 7926 + }, + { + "epoch": 2.1435911303407247, + "grad_norm": 3.09375, + "learning_rate": 0.013484187791813181, + "loss": 3.2833, + "mean_token_accuracy": 0.3793990910053253, + "num_tokens": 4052001051.0, + "step": 7927 + }, + { + "epoch": 2.1438615467820443, + "grad_norm": 2.875, + "learning_rate": 0.013482672508520106, + "loss": 3.2207, + "mean_token_accuracy": 0.38299253582954407, + "num_tokens": 4052525319.0, + "step": 7928 + }, + { + "epoch": 2.144131963223364, + "grad_norm": 3.078125, + "learning_rate": 0.013481157149051333, + "loss": 3.1612, + "mean_token_accuracy": 0.36774247884750366, + "num_tokens": 4053049509.0, + "step": 7929 + }, + { + "epoch": 2.1444023796646836, + "grad_norm": 2.796875, + "learning_rate": 0.013479641713453363, + "loss": 3.2831, + "mean_token_accuracy": 0.3950127959251404, + "num_tokens": 4053573731.0, + "step": 7930 + }, + { + "epoch": 2.1446727961060033, + "grad_norm": 68.0, + "learning_rate": 0.013478126201772696, + "loss": 10.1415, + "mean_token_accuracy": 0.010949080809950829, + "num_tokens": 4054097993.0, + "step": 7931 + }, + { + "epoch": 2.144943212547323, + "grad_norm": 8.0, + "learning_rate": 0.013476610614055829, + "loss": 3.3716, + "mean_token_accuracy": 0.3289634585380554, + "num_tokens": 4054622122.0, + "step": 7932 + }, + { + "epoch": 2.1452136289886425, + "grad_norm": 2.6875, + "learning_rate": 0.013475094950349263, + "loss": 3.3224, + "mean_token_accuracy": 0.3765464425086975, + "num_tokens": 4055118373.0, + "step": 7933 + }, + { + "epoch": 2.145484045429962, + "grad_norm": 2.96875, + "learning_rate": 0.013473579210699508, + "loss": 3.1994, + "mean_token_accuracy": 0.3596952259540558, + "num_tokens": 4055642594.0, + "step": 7934 + }, + { + "epoch": 2.145754461871282, + "grad_norm": 2.953125, + "learning_rate": 0.013472063395153064, + "loss": 3.3411, + "mean_token_accuracy": 0.37909865379333496, + "num_tokens": 4056166847.0, + "step": 7935 + }, + { + "epoch": 2.1460248783126015, + "grad_norm": 3.171875, + "learning_rate": 0.013470547503756447, + "loss": 2.9091, + "mean_token_accuracy": 0.4043576121330261, + "num_tokens": 4056691097.0, + "step": 7936 + }, + { + "epoch": 2.146295294753921, + "grad_norm": 2.921875, + "learning_rate": 0.013469031536556169, + "loss": 3.1819, + "mean_token_accuracy": 0.39774003624916077, + "num_tokens": 4057215378.0, + "step": 7937 + }, + { + "epoch": 2.1465657111952408, + "grad_norm": 2.71875, + "learning_rate": 0.013467515493598743, + "loss": 3.1778, + "mean_token_accuracy": 0.38601523637771606, + "num_tokens": 4057739652.0, + "step": 7938 + }, + { + "epoch": 2.1468361276365604, + "grad_norm": 2.828125, + "learning_rate": 0.013465999374930681, + "loss": 3.3468, + "mean_token_accuracy": 0.35512927174568176, + "num_tokens": 4058263875.0, + "step": 7939 + }, + { + "epoch": 2.14710654407788, + "grad_norm": 2.921875, + "learning_rate": 0.013464483180598508, + "loss": 3.0956, + "mean_token_accuracy": 0.3911796808242798, + "num_tokens": 4058788089.0, + "step": 7940 + }, + { + "epoch": 2.1473769605191997, + "grad_norm": 2.703125, + "learning_rate": 0.013462966910648749, + "loss": 3.2186, + "mean_token_accuracy": 0.403414785861969, + "num_tokens": 4059312306.0, + "step": 7941 + }, + { + "epoch": 2.1476473769605193, + "grad_norm": 3.015625, + "learning_rate": 0.013461450565127915, + "loss": 3.1424, + "mean_token_accuracy": 0.4040406346321106, + "num_tokens": 4059790500.0, + "step": 7942 + }, + { + "epoch": 2.147917793401839, + "grad_norm": 2.640625, + "learning_rate": 0.013459934144082543, + "loss": 3.0275, + "mean_token_accuracy": 0.42029863595962524, + "num_tokens": 4060263479.0, + "step": 7943 + }, + { + "epoch": 2.1481882098431586, + "grad_norm": 2.90625, + "learning_rate": 0.013458417647559154, + "loss": 3.2385, + "mean_token_accuracy": 0.3932300806045532, + "num_tokens": 4060787673.0, + "step": 7944 + }, + { + "epoch": 2.1484586262844783, + "grad_norm": 2.5, + "learning_rate": 0.013456901075604282, + "loss": 3.0855, + "mean_token_accuracy": 0.4063856303691864, + "num_tokens": 4061263314.0, + "step": 7945 + }, + { + "epoch": 2.148729042725798, + "grad_norm": 2.9375, + "learning_rate": 0.013455384428264461, + "loss": 3.0645, + "mean_token_accuracy": 0.40511637926101685, + "num_tokens": 4061753843.0, + "step": 7946 + }, + { + "epoch": 2.1489994591671175, + "grad_norm": 8.375, + "learning_rate": 0.013453867705586226, + "loss": 3.0501, + "mean_token_accuracy": 0.41338688135147095, + "num_tokens": 4062278113.0, + "step": 7947 + }, + { + "epoch": 2.149269875608437, + "grad_norm": 2.125, + "learning_rate": 0.013452350907616113, + "loss": 3.1996, + "mean_token_accuracy": 0.4054219126701355, + "num_tokens": 4062802313.0, + "step": 7948 + }, + { + "epoch": 2.149540292049757, + "grad_norm": 3.28125, + "learning_rate": 0.013450834034400663, + "loss": 3.2948, + "mean_token_accuracy": 0.4156520366668701, + "num_tokens": 4063272614.0, + "step": 7949 + }, + { + "epoch": 2.149810708491076, + "grad_norm": 3.453125, + "learning_rate": 0.013449317085986415, + "loss": 3.24, + "mean_token_accuracy": 0.39329850673675537, + "num_tokens": 4063796896.0, + "step": 7950 + }, + { + "epoch": 2.150081124932396, + "grad_norm": 2.34375, + "learning_rate": 0.013447800062419918, + "loss": 10.9921, + "mean_token_accuracy": 1.0096318874275312e-05, + "num_tokens": 4064321130.0, + "step": 7951 + }, + { + "epoch": 2.1503515413737153, + "grad_norm": 8.125, + "learning_rate": 0.013446282963747714, + "loss": 3.4957, + "mean_token_accuracy": 0.3743504285812378, + "num_tokens": 4064771466.0, + "step": 7952 + }, + { + "epoch": 2.150621957815035, + "grad_norm": 3.390625, + "learning_rate": 0.013444765790016355, + "loss": 3.3358, + "mean_token_accuracy": 0.364715039730072, + "num_tokens": 4065295737.0, + "step": 7953 + }, + { + "epoch": 2.1508923742563546, + "grad_norm": 2.53125, + "learning_rate": 0.013443248541272396, + "loss": 3.2517, + "mean_token_accuracy": 0.39300647377967834, + "num_tokens": 4065819996.0, + "step": 7954 + }, + { + "epoch": 2.1511627906976742, + "grad_norm": 3.78125, + "learning_rate": 0.013441731217562385, + "loss": 3.2224, + "mean_token_accuracy": 0.3805311918258667, + "num_tokens": 4066344097.0, + "step": 7955 + }, + { + "epoch": 2.151433207138994, + "grad_norm": 2.046875, + "learning_rate": 0.013440213818932878, + "loss": 3.2409, + "mean_token_accuracy": 0.4088212549686432, + "num_tokens": 4066856676.0, + "step": 7956 + }, + { + "epoch": 2.1517036235803135, + "grad_norm": 2.84375, + "learning_rate": 0.013438696345430434, + "loss": 3.1943, + "mean_token_accuracy": 0.3806198239326477, + "num_tokens": 4067380952.0, + "step": 7957 + }, + { + "epoch": 2.151974040021633, + "grad_norm": 2.640625, + "learning_rate": 0.013437178797101615, + "loss": 3.2231, + "mean_token_accuracy": 0.393964022397995, + "num_tokens": 4067905115.0, + "step": 7958 + }, + { + "epoch": 2.152244456462953, + "grad_norm": 4.03125, + "learning_rate": 0.013435661173992983, + "loss": 3.1908, + "mean_token_accuracy": 0.353931188583374, + "num_tokens": 4068383682.0, + "step": 7959 + }, + { + "epoch": 2.1525148729042725, + "grad_norm": 11.4375, + "learning_rate": 0.013434143476151104, + "loss": 3.0309, + "mean_token_accuracy": 0.43781906366348267, + "num_tokens": 4068907927.0, + "step": 7960 + }, + { + "epoch": 2.152785289345592, + "grad_norm": 2.03125, + "learning_rate": 0.013432625703622548, + "loss": 3.137, + "mean_token_accuracy": 0.3961067795753479, + "num_tokens": 4069432133.0, + "step": 7961 + }, + { + "epoch": 2.1530557057869117, + "grad_norm": 3.484375, + "learning_rate": 0.013431107856453878, + "loss": 3.2174, + "mean_token_accuracy": 0.37569713592529297, + "num_tokens": 4069956318.0, + "step": 7962 + }, + { + "epoch": 2.1533261222282314, + "grad_norm": 3.71875, + "learning_rate": 0.013429589934691668, + "loss": 3.0019, + "mean_token_accuracy": 0.40894171595573425, + "num_tokens": 4070480505.0, + "step": 7963 + }, + { + "epoch": 2.153596538669551, + "grad_norm": 3.140625, + "learning_rate": 0.013428071938382498, + "loss": 3.1846, + "mean_token_accuracy": 0.407680869102478, + "num_tokens": 4070951352.0, + "step": 7964 + }, + { + "epoch": 2.1538669551108707, + "grad_norm": 3.109375, + "learning_rate": 0.013426553867572934, + "loss": 3.0913, + "mean_token_accuracy": 0.4151765704154968, + "num_tokens": 4071475438.0, + "step": 7965 + }, + { + "epoch": 2.1541373715521903, + "grad_norm": 3.765625, + "learning_rate": 0.013425035722309568, + "loss": 3.2337, + "mean_token_accuracy": 0.40289944410324097, + "num_tokens": 4071999711.0, + "step": 7966 + }, + { + "epoch": 2.15440778799351, + "grad_norm": 9.375, + "learning_rate": 0.013423517502638973, + "loss": 3.0535, + "mean_token_accuracy": 0.4175484776496887, + "num_tokens": 4072523886.0, + "step": 7967 + }, + { + "epoch": 2.1546782044348296, + "grad_norm": 2.203125, + "learning_rate": 0.013421999208607734, + "loss": 3.4377, + "mean_token_accuracy": 0.3496018648147583, + "num_tokens": 4073036093.0, + "step": 7968 + }, + { + "epoch": 2.1549486208761492, + "grad_norm": 2.15625, + "learning_rate": 0.013420480840262435, + "loss": 3.0241, + "mean_token_accuracy": 0.3949812948703766, + "num_tokens": 4073560282.0, + "step": 7969 + }, + { + "epoch": 2.155219037317469, + "grad_norm": 2.734375, + "learning_rate": 0.013418962397649668, + "loss": 3.14, + "mean_token_accuracy": 0.3736467957496643, + "num_tokens": 4074084568.0, + "step": 7970 + }, + { + "epoch": 2.1554894537587885, + "grad_norm": 54.75, + "learning_rate": 0.013417443880816018, + "loss": 12.35, + "mean_token_accuracy": 0.00574874971061945, + "num_tokens": 4074608615.0, + "step": 7971 + }, + { + "epoch": 2.155759870200108, + "grad_norm": 6.65625, + "learning_rate": 0.013415925289808084, + "loss": 3.9256, + "mean_token_accuracy": 0.3506108820438385, + "num_tokens": 4075089059.0, + "step": 7972 + }, + { + "epoch": 2.156030286641428, + "grad_norm": 2.796875, + "learning_rate": 0.013414406624672453, + "loss": 3.5083, + "mean_token_accuracy": 0.36775341629981995, + "num_tokens": 4075613161.0, + "step": 7973 + }, + { + "epoch": 2.1563007030827475, + "grad_norm": 3.09375, + "learning_rate": 0.01341288788545573, + "loss": 3.414, + "mean_token_accuracy": 0.36705493927001953, + "num_tokens": 4076137351.0, + "step": 7974 + }, + { + "epoch": 2.156571119524067, + "grad_norm": 2.9375, + "learning_rate": 0.013411369072204507, + "loss": 3.3901, + "mean_token_accuracy": 0.36613500118255615, + "num_tokens": 4076661635.0, + "step": 7975 + }, + { + "epoch": 2.1568415359653867, + "grad_norm": 2.484375, + "learning_rate": 0.013409850184965391, + "loss": 3.0195, + "mean_token_accuracy": 0.41002485156059265, + "num_tokens": 4077134324.0, + "step": 7976 + }, + { + "epoch": 2.1571119524067064, + "grad_norm": 2.71875, + "learning_rate": 0.013408331223784988, + "loss": 3.2801, + "mean_token_accuracy": 0.3805384039878845, + "num_tokens": 4077658604.0, + "step": 7977 + }, + { + "epoch": 2.157382368848026, + "grad_norm": 2.15625, + "learning_rate": 0.013406812188709899, + "loss": 2.7924, + "mean_token_accuracy": 0.41305840015411377, + "num_tokens": 4078182830.0, + "step": 7978 + }, + { + "epoch": 2.1576527852893457, + "grad_norm": 2.546875, + "learning_rate": 0.013405293079786731, + "loss": 3.3558, + "mean_token_accuracy": 0.3882375657558441, + "num_tokens": 4078707065.0, + "step": 7979 + }, + { + "epoch": 2.1579232017306653, + "grad_norm": 4.03125, + "learning_rate": 0.0134037738970621, + "loss": 3.282, + "mean_token_accuracy": 0.3749086260795593, + "num_tokens": 4079231324.0, + "step": 7980 + }, + { + "epoch": 2.158193618171985, + "grad_norm": 2.296875, + "learning_rate": 0.013402254640582617, + "loss": 3.0932, + "mean_token_accuracy": 0.40582409501075745, + "num_tokens": 4079704375.0, + "step": 7981 + }, + { + "epoch": 2.1584640346133046, + "grad_norm": 2.796875, + "learning_rate": 0.013400735310394896, + "loss": 3.2961, + "mean_token_accuracy": 0.38265636563301086, + "num_tokens": 4080228639.0, + "step": 7982 + }, + { + "epoch": 2.1587344510546242, + "grad_norm": 3.109375, + "learning_rate": 0.013399215906545555, + "loss": 3.315, + "mean_token_accuracy": 0.402908593416214, + "num_tokens": 4080703532.0, + "step": 7983 + }, + { + "epoch": 2.159004867495944, + "grad_norm": 3.28125, + "learning_rate": 0.013397696429081219, + "loss": 3.1608, + "mean_token_accuracy": 0.3748003840446472, + "num_tokens": 4081227637.0, + "step": 7984 + }, + { + "epoch": 2.1592752839372635, + "grad_norm": 4.4375, + "learning_rate": 0.013396176878048501, + "loss": 3.3209, + "mean_token_accuracy": 0.3821365237236023, + "num_tokens": 4081751779.0, + "step": 7985 + }, + { + "epoch": 2.159545700378583, + "grad_norm": 3.0625, + "learning_rate": 0.013394657253494029, + "loss": 3.247, + "mean_token_accuracy": 0.4075143337249756, + "num_tokens": 4082276036.0, + "step": 7986 + }, + { + "epoch": 2.159816116819903, + "grad_norm": 2.734375, + "learning_rate": 0.013393137555464433, + "loss": 3.1501, + "mean_token_accuracy": 0.3954535722732544, + "num_tokens": 4082800254.0, + "step": 7987 + }, + { + "epoch": 2.1600865332612225, + "grad_norm": 23.125, + "learning_rate": 0.013391617784006339, + "loss": 3.0848, + "mean_token_accuracy": 0.42380204796791077, + "num_tokens": 4083300160.0, + "step": 7988 + }, + { + "epoch": 2.160356949702542, + "grad_norm": 4.5625, + "learning_rate": 0.013390097939166375, + "loss": 3.449, + "mean_token_accuracy": 0.3763355016708374, + "num_tokens": 4083824400.0, + "step": 7989 + }, + { + "epoch": 2.1606273661438617, + "grad_norm": 2.375, + "learning_rate": 0.013388578020991183, + "loss": 3.1289, + "mean_token_accuracy": 0.38678067922592163, + "num_tokens": 4084348665.0, + "step": 7990 + }, + { + "epoch": 2.160897782585181, + "grad_norm": 8.125, + "learning_rate": 0.013387058029527388, + "loss": 10.238, + "mean_token_accuracy": 0.0010437910677865148, + "num_tokens": 4084872916.0, + "step": 7991 + }, + { + "epoch": 2.161168199026501, + "grad_norm": 5.40625, + "learning_rate": 0.013385537964821632, + "loss": 3.5153, + "mean_token_accuracy": 0.3590601086616516, + "num_tokens": 4085397135.0, + "step": 7992 + }, + { + "epoch": 2.1614386154678202, + "grad_norm": 2.34375, + "learning_rate": 0.013384017826920558, + "loss": 3.4288, + "mean_token_accuracy": 0.3524368405342102, + "num_tokens": 4085885608.0, + "step": 7993 + }, + { + "epoch": 2.16170903190914, + "grad_norm": 2.546875, + "learning_rate": 0.013382497615870808, + "loss": 3.3011, + "mean_token_accuracy": 0.3775060772895813, + "num_tokens": 4086409838.0, + "step": 7994 + }, + { + "epoch": 2.1619794483504595, + "grad_norm": 4.0, + "learning_rate": 0.013380977331719023, + "loss": 3.2144, + "mean_token_accuracy": 0.38076648116111755, + "num_tokens": 4086913353.0, + "step": 7995 + }, + { + "epoch": 2.162249864791779, + "grad_norm": 2.09375, + "learning_rate": 0.01337945697451185, + "loss": 3.3578, + "mean_token_accuracy": 0.39066845178604126, + "num_tokens": 4087377091.0, + "step": 7996 + }, + { + "epoch": 2.162520281233099, + "grad_norm": 3.296875, + "learning_rate": 0.013377936544295942, + "loss": 3.3209, + "mean_token_accuracy": 0.36963951587677, + "num_tokens": 4087901168.0, + "step": 7997 + }, + { + "epoch": 2.1627906976744184, + "grad_norm": 2.46875, + "learning_rate": 0.013376416041117946, + "loss": 3.073, + "mean_token_accuracy": 0.4207912087440491, + "num_tokens": 4088421410.0, + "step": 7998 + }, + { + "epoch": 2.163061114115738, + "grad_norm": 3.0625, + "learning_rate": 0.013374895465024517, + "loss": 3.1028, + "mean_token_accuracy": 0.38833266496658325, + "num_tokens": 4088945407.0, + "step": 7999 + }, + { + "epoch": 2.1633315305570577, + "grad_norm": 2.5625, + "learning_rate": 0.013373374816062312, + "loss": 3.0151, + "mean_token_accuracy": 0.39656156301498413, + "num_tokens": 4089469417.0, + "step": 8000 + }, + { + "epoch": 2.1636019469983774, + "grad_norm": 2.984375, + "learning_rate": 0.013371854094277989, + "loss": 3.0689, + "mean_token_accuracy": 0.40294501185417175, + "num_tokens": 4089948750.0, + "step": 8001 + }, + { + "epoch": 2.163872363439697, + "grad_norm": 3.0625, + "learning_rate": 0.013370333299718205, + "loss": 3.0549, + "mean_token_accuracy": 0.41827452182769775, + "num_tokens": 4090467662.0, + "step": 8002 + }, + { + "epoch": 2.1641427798810167, + "grad_norm": 2.703125, + "learning_rate": 0.013368812432429628, + "loss": 3.1253, + "mean_token_accuracy": 0.3868546783924103, + "num_tokens": 4090991925.0, + "step": 8003 + }, + { + "epoch": 2.1644131963223363, + "grad_norm": 2.4375, + "learning_rate": 0.013367291492458916, + "loss": 2.972, + "mean_token_accuracy": 0.4207267761230469, + "num_tokens": 4091516116.0, + "step": 8004 + }, + { + "epoch": 2.164683612763656, + "grad_norm": 2.453125, + "learning_rate": 0.013365770479852742, + "loss": 3.2321, + "mean_token_accuracy": 0.39399483799934387, + "num_tokens": 4092040277.0, + "step": 8005 + }, + { + "epoch": 2.1649540292049756, + "grad_norm": 2.8125, + "learning_rate": 0.013364249394657768, + "loss": 3.2868, + "mean_token_accuracy": 0.3732317388057709, + "num_tokens": 4092564415.0, + "step": 8006 + }, + { + "epoch": 2.1652244456462952, + "grad_norm": 2.421875, + "learning_rate": 0.013362728236920673, + "loss": 3.4109, + "mean_token_accuracy": 0.3900836110115051, + "num_tokens": 4093041609.0, + "step": 8007 + }, + { + "epoch": 2.165494862087615, + "grad_norm": 2.921875, + "learning_rate": 0.013361207006688127, + "loss": 3.2076, + "mean_token_accuracy": 0.3839126229286194, + "num_tokens": 4093565812.0, + "step": 8008 + }, + { + "epoch": 2.1657652785289345, + "grad_norm": 2.640625, + "learning_rate": 0.013359685704006806, + "loss": 3.207, + "mean_token_accuracy": 0.37920016050338745, + "num_tokens": 4094089928.0, + "step": 8009 + }, + { + "epoch": 2.166035694970254, + "grad_norm": 2.9375, + "learning_rate": 0.01335816432892339, + "loss": 3.1225, + "mean_token_accuracy": 0.38579437136650085, + "num_tokens": 4094614192.0, + "step": 8010 + }, + { + "epoch": 2.166306111411574, + "grad_norm": 10.75, + "learning_rate": 0.013356642881484556, + "loss": 10.9301, + "mean_token_accuracy": 0.007340649608522654, + "num_tokens": 4095138450.0, + "step": 8011 + }, + { + "epoch": 2.1665765278528935, + "grad_norm": 5.40625, + "learning_rate": 0.013355121361736983, + "loss": 3.3822, + "mean_token_accuracy": 0.37204062938690186, + "num_tokens": 4095651001.0, + "step": 8012 + }, + { + "epoch": 2.166846944294213, + "grad_norm": 2.03125, + "learning_rate": 0.013353599769727371, + "loss": 2.9952, + "mean_token_accuracy": 0.40696480870246887, + "num_tokens": 4096175098.0, + "step": 8013 + }, + { + "epoch": 2.1671173607355327, + "grad_norm": 2.703125, + "learning_rate": 0.01335207810550239, + "loss": 3.015, + "mean_token_accuracy": 0.3913821578025818, + "num_tokens": 4096668720.0, + "step": 8014 + }, + { + "epoch": 2.1673877771768524, + "grad_norm": 2.578125, + "learning_rate": 0.013350556369108733, + "loss": 3.1786, + "mean_token_accuracy": 0.3776131868362427, + "num_tokens": 4097171074.0, + "step": 8015 + }, + { + "epoch": 2.167658193618172, + "grad_norm": 3.0, + "learning_rate": 0.013349034560593099, + "loss": 3.2715, + "mean_token_accuracy": 0.37369686365127563, + "num_tokens": 4097695333.0, + "step": 8016 + }, + { + "epoch": 2.1679286100594917, + "grad_norm": 3.109375, + "learning_rate": 0.013347512680002177, + "loss": 3.2266, + "mean_token_accuracy": 0.3917592465877533, + "num_tokens": 4098219609.0, + "step": 8017 + }, + { + "epoch": 2.1681990265008113, + "grad_norm": 3.375, + "learning_rate": 0.01334599072738266, + "loss": 3.3858, + "mean_token_accuracy": 0.37375229597091675, + "num_tokens": 4098739399.0, + "step": 8018 + }, + { + "epoch": 2.168469442942131, + "grad_norm": 3.046875, + "learning_rate": 0.01334446870278125, + "loss": 3.0976, + "mean_token_accuracy": 0.41108012199401855, + "num_tokens": 4099211005.0, + "step": 8019 + }, + { + "epoch": 2.1687398593834506, + "grad_norm": 3.5, + "learning_rate": 0.013342946606244646, + "loss": 3.148, + "mean_token_accuracy": 0.4055129289627075, + "num_tokens": 4099715725.0, + "step": 8020 + }, + { + "epoch": 2.1690102758247702, + "grad_norm": 2.1875, + "learning_rate": 0.013341424437819547, + "loss": 3.1811, + "mean_token_accuracy": 0.408992201089859, + "num_tokens": 4100239794.0, + "step": 8021 + }, + { + "epoch": 2.16928069226609, + "grad_norm": 2.640625, + "learning_rate": 0.013339902197552662, + "loss": 3.228, + "mean_token_accuracy": 0.4028210937976837, + "num_tokens": 4100706947.0, + "step": 8022 + }, + { + "epoch": 2.1695511087074095, + "grad_norm": 2.640625, + "learning_rate": 0.013338379885490697, + "loss": 3.1953, + "mean_token_accuracy": 0.3919799029827118, + "num_tokens": 4101231228.0, + "step": 8023 + }, + { + "epoch": 2.169821525148729, + "grad_norm": 3.140625, + "learning_rate": 0.01333685750168036, + "loss": 3.1895, + "mean_token_accuracy": 0.37432894110679626, + "num_tokens": 4101755272.0, + "step": 8024 + }, + { + "epoch": 2.170091941590049, + "grad_norm": 3.125, + "learning_rate": 0.01333533504616836, + "loss": 3.2753, + "mean_token_accuracy": 0.4080350995063782, + "num_tokens": 4102241709.0, + "step": 8025 + }, + { + "epoch": 2.1703623580313685, + "grad_norm": 3.3125, + "learning_rate": 0.013333812519001419, + "loss": 2.9525, + "mean_token_accuracy": 0.4120560884475708, + "num_tokens": 4102741030.0, + "step": 8026 + }, + { + "epoch": 2.170632774472688, + "grad_norm": 2.546875, + "learning_rate": 0.013332289920226241, + "loss": 3.2037, + "mean_token_accuracy": 0.42534905672073364, + "num_tokens": 4103256399.0, + "step": 8027 + }, + { + "epoch": 2.1709031909140077, + "grad_norm": 3.390625, + "learning_rate": 0.013330767249889552, + "loss": 3.3142, + "mean_token_accuracy": 0.3851977586746216, + "num_tokens": 4103780662.0, + "step": 8028 + }, + { + "epoch": 2.1711736073553274, + "grad_norm": 3.640625, + "learning_rate": 0.01332924450803807, + "loss": 3.1026, + "mean_token_accuracy": 0.40321359038352966, + "num_tokens": 4104288553.0, + "step": 8029 + }, + { + "epoch": 2.171444023796647, + "grad_norm": 3.234375, + "learning_rate": 0.013327721694718516, + "loss": 3.3188, + "mean_token_accuracy": 0.3774920701980591, + "num_tokens": 4104812828.0, + "step": 8030 + }, + { + "epoch": 2.1717144402379667, + "grad_norm": 75.5, + "learning_rate": 0.013326198809977612, + "loss": 10.8341, + "mean_token_accuracy": 0.007509286515414715, + "num_tokens": 4105337080.0, + "step": 8031 + }, + { + "epoch": 2.171984856679286, + "grad_norm": 7.1875, + "learning_rate": 0.013324675853862092, + "loss": 3.4908, + "mean_token_accuracy": 0.34091290831565857, + "num_tokens": 4105861160.0, + "step": 8032 + }, + { + "epoch": 2.172255273120606, + "grad_norm": 2.46875, + "learning_rate": 0.013323152826418678, + "loss": 3.4978, + "mean_token_accuracy": 0.37317413091659546, + "num_tokens": 4106345034.0, + "step": 8033 + }, + { + "epoch": 2.172525689561925, + "grad_norm": 2.328125, + "learning_rate": 0.013321629727694106, + "loss": 3.0722, + "mean_token_accuracy": 0.40275728702545166, + "num_tokens": 4106869190.0, + "step": 8034 + }, + { + "epoch": 2.172796106003245, + "grad_norm": 2.453125, + "learning_rate": 0.013320106557735103, + "loss": 3.2668, + "mean_token_accuracy": 0.38741356134414673, + "num_tokens": 4107393413.0, + "step": 8035 + }, + { + "epoch": 2.1730665224445644, + "grad_norm": 3.0, + "learning_rate": 0.01331858331658841, + "loss": 3.1478, + "mean_token_accuracy": 0.4558185338973999, + "num_tokens": 4107852709.0, + "step": 8036 + }, + { + "epoch": 2.173336938885884, + "grad_norm": 3.15625, + "learning_rate": 0.01331706000430076, + "loss": 3.1627, + "mean_token_accuracy": 0.41139620542526245, + "num_tokens": 4108314295.0, + "step": 8037 + }, + { + "epoch": 2.1736073553272037, + "grad_norm": 2.375, + "learning_rate": 0.0133155366209189, + "loss": 3.079, + "mean_token_accuracy": 0.42613738775253296, + "num_tokens": 4108773706.0, + "step": 8038 + }, + { + "epoch": 2.1738777717685234, + "grad_norm": 2.71875, + "learning_rate": 0.01331401316648956, + "loss": 3.1935, + "mean_token_accuracy": 0.3808252215385437, + "num_tokens": 4109297849.0, + "step": 8039 + }, + { + "epoch": 2.174148188209843, + "grad_norm": 2.828125, + "learning_rate": 0.013312489641059497, + "loss": 3.2472, + "mean_token_accuracy": 0.41107243299484253, + "num_tokens": 4109807634.0, + "step": 8040 + }, + { + "epoch": 2.1744186046511627, + "grad_norm": 3.15625, + "learning_rate": 0.013310966044675448, + "loss": 3.2844, + "mean_token_accuracy": 0.36482852697372437, + "num_tokens": 4110331866.0, + "step": 8041 + }, + { + "epoch": 2.1746890210924823, + "grad_norm": 2.515625, + "learning_rate": 0.013309442377384165, + "loss": 3.2465, + "mean_token_accuracy": 0.38022148609161377, + "num_tokens": 4110856140.0, + "step": 8042 + }, + { + "epoch": 2.174959437533802, + "grad_norm": 2.53125, + "learning_rate": 0.0133079186392324, + "loss": 3.0463, + "mean_token_accuracy": 0.4119643568992615, + "num_tokens": 4111380419.0, + "step": 8043 + }, + { + "epoch": 2.1752298539751216, + "grad_norm": 3.0625, + "learning_rate": 0.013306394830266901, + "loss": 3.3496, + "mean_token_accuracy": 0.3824022114276886, + "num_tokens": 4111825531.0, + "step": 8044 + }, + { + "epoch": 2.1755002704164412, + "grad_norm": 2.890625, + "learning_rate": 0.013304870950534427, + "loss": 3.2959, + "mean_token_accuracy": 0.39936262369155884, + "num_tokens": 4112291094.0, + "step": 8045 + }, + { + "epoch": 2.175770686857761, + "grad_norm": 3.109375, + "learning_rate": 0.013303347000081739, + "loss": 3.1873, + "mean_token_accuracy": 0.3935481905937195, + "num_tokens": 4112815366.0, + "step": 8046 + }, + { + "epoch": 2.1760411032990805, + "grad_norm": 6.8125, + "learning_rate": 0.013301822978955586, + "loss": 2.8302, + "mean_token_accuracy": 0.43531423807144165, + "num_tokens": 4113339559.0, + "step": 8047 + }, + { + "epoch": 2.1763115197404, + "grad_norm": 2.359375, + "learning_rate": 0.013300298887202735, + "loss": 3.0052, + "mean_token_accuracy": 0.44336625933647156, + "num_tokens": 4113863792.0, + "step": 8048 + }, + { + "epoch": 2.17658193618172, + "grad_norm": 3.359375, + "learning_rate": 0.013298774724869952, + "loss": 3.0581, + "mean_token_accuracy": 0.41100141406059265, + "num_tokens": 4114388067.0, + "step": 8049 + }, + { + "epoch": 2.1768523526230394, + "grad_norm": 4.78125, + "learning_rate": 0.013297250492003997, + "loss": 3.3361, + "mean_token_accuracy": 0.3986768424510956, + "num_tokens": 4114912234.0, + "step": 8050 + }, + { + "epoch": 2.177122769064359, + "grad_norm": 53.75, + "learning_rate": 0.01329572618865164, + "loss": 11.4729, + "mean_token_accuracy": 0.007620805408805609, + "num_tokens": 4115436504.0, + "step": 8051 + }, + { + "epoch": 2.1773931855056787, + "grad_norm": 8.25, + "learning_rate": 0.013294201814859657, + "loss": 3.68, + "mean_token_accuracy": 0.3300107419490814, + "num_tokens": 4115960770.0, + "step": 8052 + }, + { + "epoch": 2.1776636019469984, + "grad_norm": 1.9609375, + "learning_rate": 0.013292677370674815, + "loss": 3.2685, + "mean_token_accuracy": 0.38278818130493164, + "num_tokens": 4116485022.0, + "step": 8053 + }, + { + "epoch": 2.177934018388318, + "grad_norm": 3.390625, + "learning_rate": 0.013291152856143884, + "loss": 3.1769, + "mean_token_accuracy": 0.3843834400177002, + "num_tokens": 4117009092.0, + "step": 8054 + }, + { + "epoch": 2.1782044348296377, + "grad_norm": 3.921875, + "learning_rate": 0.01328962827131365, + "loss": 3.1448, + "mean_token_accuracy": 0.39529481530189514, + "num_tokens": 4117432532.0, + "step": 8055 + }, + { + "epoch": 2.1784748512709573, + "grad_norm": 3.078125, + "learning_rate": 0.013288103616230888, + "loss": 3.1971, + "mean_token_accuracy": 0.38515618443489075, + "num_tokens": 4117956796.0, + "step": 8056 + }, + { + "epoch": 2.178745267712277, + "grad_norm": 3.875, + "learning_rate": 0.013286578890942376, + "loss": 3.1699, + "mean_token_accuracy": 0.3986591100692749, + "num_tokens": 4118480973.0, + "step": 8057 + }, + { + "epoch": 2.1790156841535966, + "grad_norm": 3.40625, + "learning_rate": 0.013285054095494898, + "loss": 3.4346, + "mean_token_accuracy": 0.368866890668869, + "num_tokens": 4118969863.0, + "step": 8058 + }, + { + "epoch": 2.1792861005949162, + "grad_norm": 3.234375, + "learning_rate": 0.013283529229935243, + "loss": 3.2024, + "mean_token_accuracy": 0.38223177194595337, + "num_tokens": 4119451704.0, + "step": 8059 + }, + { + "epoch": 2.179556517036236, + "grad_norm": 3.0625, + "learning_rate": 0.013282004294310193, + "loss": 3.1393, + "mean_token_accuracy": 0.3915310502052307, + "num_tokens": 4119975974.0, + "step": 8060 + }, + { + "epoch": 2.1798269334775555, + "grad_norm": 2.65625, + "learning_rate": 0.013280479288666545, + "loss": 3.0064, + "mean_token_accuracy": 0.4098760485649109, + "num_tokens": 4120462377.0, + "step": 8061 + }, + { + "epoch": 2.180097349918875, + "grad_norm": 2.5, + "learning_rate": 0.01327895421305108, + "loss": 3.1428, + "mean_token_accuracy": 0.4025510549545288, + "num_tokens": 4120986619.0, + "step": 8062 + }, + { + "epoch": 2.180367766360195, + "grad_norm": 2.9375, + "learning_rate": 0.013277429067510602, + "loss": 3.2342, + "mean_token_accuracy": 0.3899003863334656, + "num_tokens": 4121503456.0, + "step": 8063 + }, + { + "epoch": 2.1806381828015144, + "grad_norm": 2.984375, + "learning_rate": 0.013275903852091904, + "loss": 3.3036, + "mean_token_accuracy": 0.39886438846588135, + "num_tokens": 4122027649.0, + "step": 8064 + }, + { + "epoch": 2.180908599242834, + "grad_norm": 2.796875, + "learning_rate": 0.013274378566841778, + "loss": 2.9951, + "mean_token_accuracy": 0.39017951488494873, + "num_tokens": 4122551861.0, + "step": 8065 + }, + { + "epoch": 2.1811790156841537, + "grad_norm": 2.796875, + "learning_rate": 0.01327285321180703, + "loss": 3.1081, + "mean_token_accuracy": 0.39759308099746704, + "num_tokens": 4123075991.0, + "step": 8066 + }, + { + "epoch": 2.1814494321254734, + "grad_norm": 2.578125, + "learning_rate": 0.013271327787034467, + "loss": 3.1565, + "mean_token_accuracy": 0.40445876121520996, + "num_tokens": 4123600214.0, + "step": 8067 + }, + { + "epoch": 2.181719848566793, + "grad_norm": 2.96875, + "learning_rate": 0.013269802292570881, + "loss": 3.0161, + "mean_token_accuracy": 0.40729033946990967, + "num_tokens": 4124124494.0, + "step": 8068 + }, + { + "epoch": 2.1819902650081127, + "grad_norm": 2.78125, + "learning_rate": 0.013268276728463091, + "loss": 3.0216, + "mean_token_accuracy": 0.3792648911476135, + "num_tokens": 4124648761.0, + "step": 8069 + }, + { + "epoch": 2.1822606814494323, + "grad_norm": 3.15625, + "learning_rate": 0.013266751094757896, + "loss": 3.1818, + "mean_token_accuracy": 0.39613696932792664, + "num_tokens": 4125172916.0, + "step": 8070 + }, + { + "epoch": 2.182531097890752, + "grad_norm": 69.0, + "learning_rate": 0.013265225391502114, + "loss": 11.9751, + "mean_token_accuracy": 0.002229427918791771, + "num_tokens": 4125687216.0, + "step": 8071 + }, + { + "epoch": 2.1828015143320716, + "grad_norm": 6.5, + "learning_rate": 0.013263699618742552, + "loss": 3.7538, + "mean_token_accuracy": 0.333723247051239, + "num_tokens": 4126211349.0, + "step": 8072 + }, + { + "epoch": 2.183071930773391, + "grad_norm": 2.171875, + "learning_rate": 0.013262173776526031, + "loss": 3.2109, + "mean_token_accuracy": 0.39643698930740356, + "num_tokens": 4126735634.0, + "step": 8073 + }, + { + "epoch": 2.183342347214711, + "grad_norm": 3.03125, + "learning_rate": 0.013260647864899366, + "loss": 3.153, + "mean_token_accuracy": 0.3961973190307617, + "num_tokens": 4127259914.0, + "step": 8074 + }, + { + "epoch": 2.18361276365603, + "grad_norm": 3.21875, + "learning_rate": 0.013259121883909377, + "loss": 3.3027, + "mean_token_accuracy": 0.3955792188644409, + "num_tokens": 4127784034.0, + "step": 8075 + }, + { + "epoch": 2.1838831800973497, + "grad_norm": 2.46875, + "learning_rate": 0.013257595833602887, + "loss": 3.085, + "mean_token_accuracy": 0.4024348855018616, + "num_tokens": 4128308304.0, + "step": 8076 + }, + { + "epoch": 2.1841535965386694, + "grad_norm": 3.03125, + "learning_rate": 0.013256069714026713, + "loss": 3.3331, + "mean_token_accuracy": 0.3869505524635315, + "num_tokens": 4128832510.0, + "step": 8077 + }, + { + "epoch": 2.184424012979989, + "grad_norm": 2.578125, + "learning_rate": 0.013254543525227687, + "loss": 3.0968, + "mean_token_accuracy": 0.4089614748954773, + "num_tokens": 4129350707.0, + "step": 8078 + }, + { + "epoch": 2.1846944294213086, + "grad_norm": 2.96875, + "learning_rate": 0.013253017267252636, + "loss": 3.2144, + "mean_token_accuracy": 0.4088580906391144, + "num_tokens": 4129874907.0, + "step": 8079 + }, + { + "epoch": 2.1849648458626283, + "grad_norm": 3.453125, + "learning_rate": 0.01325149094014839, + "loss": 3.2979, + "mean_token_accuracy": 0.39064422249794006, + "num_tokens": 4130375780.0, + "step": 8080 + }, + { + "epoch": 2.185235262303948, + "grad_norm": 2.859375, + "learning_rate": 0.01324996454396178, + "loss": 3.1315, + "mean_token_accuracy": 0.4016217589378357, + "num_tokens": 4130900008.0, + "step": 8081 + }, + { + "epoch": 2.1855056787452676, + "grad_norm": 3.84375, + "learning_rate": 0.013248438078739645, + "loss": 3.023, + "mean_token_accuracy": 0.4237717390060425, + "num_tokens": 4131424277.0, + "step": 8082 + }, + { + "epoch": 2.185776095186587, + "grad_norm": 2.359375, + "learning_rate": 0.013246911544528811, + "loss": 3.0383, + "mean_token_accuracy": 0.4062085747718811, + "num_tokens": 4131948533.0, + "step": 8083 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 3.8125, + "learning_rate": 0.013245384941376127, + "loss": 3.2873, + "mean_token_accuracy": 0.3876161575317383, + "num_tokens": 4132472735.0, + "step": 8084 + }, + { + "epoch": 2.1863169280692265, + "grad_norm": 3.40625, + "learning_rate": 0.013243858269328428, + "loss": 3.0813, + "mean_token_accuracy": 0.3855814039707184, + "num_tokens": 4132996806.0, + "step": 8085 + }, + { + "epoch": 2.186587344510546, + "grad_norm": 3.28125, + "learning_rate": 0.013242331528432558, + "loss": 3.2168, + "mean_token_accuracy": 0.38762229681015015, + "num_tokens": 4133520982.0, + "step": 8086 + }, + { + "epoch": 2.186857760951866, + "grad_norm": 3.015625, + "learning_rate": 0.013240804718735365, + "loss": 3.299, + "mean_token_accuracy": 0.3844684958457947, + "num_tokens": 4134042214.0, + "step": 8087 + }, + { + "epoch": 2.1871281773931854, + "grad_norm": 2.5, + "learning_rate": 0.01323927784028369, + "loss": 2.9823, + "mean_token_accuracy": 0.4026850759983063, + "num_tokens": 4134566289.0, + "step": 8088 + }, + { + "epoch": 2.187398593834505, + "grad_norm": 2.734375, + "learning_rate": 0.01323775089312439, + "loss": 3.3133, + "mean_token_accuracy": 0.3966282308101654, + "num_tokens": 4135090538.0, + "step": 8089 + }, + { + "epoch": 2.1876690102758247, + "grad_norm": 3.359375, + "learning_rate": 0.01323622387730431, + "loss": 3.3053, + "mean_token_accuracy": 0.37973296642303467, + "num_tokens": 4135614451.0, + "step": 8090 + }, + { + "epoch": 2.1879394267171444, + "grad_norm": 58.75, + "learning_rate": 0.013234696792870305, + "loss": 11.6724, + "mean_token_accuracy": 0.0053183287382125854, + "num_tokens": 4136138708.0, + "step": 8091 + }, + { + "epoch": 2.188209843158464, + "grad_norm": 8.25, + "learning_rate": 0.013233169639869232, + "loss": 3.5956, + "mean_token_accuracy": 0.3433414697647095, + "num_tokens": 4136618219.0, + "step": 8092 + }, + { + "epoch": 2.1884802595997837, + "grad_norm": 2.28125, + "learning_rate": 0.013231642418347941, + "loss": 3.1493, + "mean_token_accuracy": 0.39423906803131104, + "num_tokens": 4137142427.0, + "step": 8093 + }, + { + "epoch": 2.1887506760411033, + "grad_norm": 3.15625, + "learning_rate": 0.0132301151283533, + "loss": 3.2066, + "mean_token_accuracy": 0.37831616401672363, + "num_tokens": 4137666616.0, + "step": 8094 + }, + { + "epoch": 2.189021092482423, + "grad_norm": 2.953125, + "learning_rate": 0.013228587769932174, + "loss": 3.1753, + "mean_token_accuracy": 0.3950234353542328, + "num_tokens": 4138190715.0, + "step": 8095 + }, + { + "epoch": 2.1892915089237426, + "grad_norm": 3.03125, + "learning_rate": 0.013227060343131418, + "loss": 3.2276, + "mean_token_accuracy": 0.36449724435806274, + "num_tokens": 4138714954.0, + "step": 8096 + }, + { + "epoch": 2.1895619253650622, + "grad_norm": 2.65625, + "learning_rate": 0.013225532847997901, + "loss": 3.1084, + "mean_token_accuracy": 0.4068652391433716, + "num_tokens": 4139239129.0, + "step": 8097 + }, + { + "epoch": 2.189832341806382, + "grad_norm": 2.78125, + "learning_rate": 0.013224005284578493, + "loss": 3.0488, + "mean_token_accuracy": 0.36526602506637573, + "num_tokens": 4139763235.0, + "step": 8098 + }, + { + "epoch": 2.1901027582477015, + "grad_norm": 2.421875, + "learning_rate": 0.013222477652920062, + "loss": 2.9644, + "mean_token_accuracy": 0.4132576882839203, + "num_tokens": 4140243848.0, + "step": 8099 + }, + { + "epoch": 2.190373174689021, + "grad_norm": 3.171875, + "learning_rate": 0.01322094995306948, + "loss": 3.1078, + "mean_token_accuracy": 0.3733806610107422, + "num_tokens": 4140748986.0, + "step": 8100 + }, + { + "epoch": 2.190643591130341, + "grad_norm": 2.328125, + "learning_rate": 0.01321942218507362, + "loss": 3.1316, + "mean_token_accuracy": 0.40320903062820435, + "num_tokens": 4141273216.0, + "step": 8101 + }, + { + "epoch": 2.1909140075716604, + "grad_norm": 2.875, + "learning_rate": 0.013217894348979365, + "loss": 3.1702, + "mean_token_accuracy": 0.412309467792511, + "num_tokens": 4141797457.0, + "step": 8102 + }, + { + "epoch": 2.19118442401298, + "grad_norm": 2.375, + "learning_rate": 0.013216366444833585, + "loss": 3.1495, + "mean_token_accuracy": 0.41196972131729126, + "num_tokens": 4142321682.0, + "step": 8103 + }, + { + "epoch": 2.1914548404542997, + "grad_norm": 2.609375, + "learning_rate": 0.013214838472683163, + "loss": 3.2352, + "mean_token_accuracy": 0.3917580544948578, + "num_tokens": 4142845766.0, + "step": 8104 + }, + { + "epoch": 2.1917252568956194, + "grad_norm": 3.109375, + "learning_rate": 0.01321331043257499, + "loss": 3.279, + "mean_token_accuracy": 0.38710087537765503, + "num_tokens": 4143369957.0, + "step": 8105 + }, + { + "epoch": 2.191995673336939, + "grad_norm": 39.5, + "learning_rate": 0.01321178232455594, + "loss": 3.2946, + "mean_token_accuracy": 0.38772493600845337, + "num_tokens": 4143894181.0, + "step": 8106 + }, + { + "epoch": 2.1922660897782587, + "grad_norm": 4.0, + "learning_rate": 0.013210254148672905, + "loss": 3.2986, + "mean_token_accuracy": 0.36951595544815063, + "num_tokens": 4144418449.0, + "step": 8107 + }, + { + "epoch": 2.1925365062195783, + "grad_norm": 2.6875, + "learning_rate": 0.01320872590497277, + "loss": 3.1425, + "mean_token_accuracy": 0.39150530099868774, + "num_tokens": 4144942732.0, + "step": 8108 + }, + { + "epoch": 2.192806922660898, + "grad_norm": 2.9375, + "learning_rate": 0.013207197593502432, + "loss": 3.1687, + "mean_token_accuracy": 0.39411240816116333, + "num_tokens": 4145467001.0, + "step": 8109 + }, + { + "epoch": 2.1930773391022176, + "grad_norm": 3.421875, + "learning_rate": 0.013205669214308777, + "loss": 3.4285, + "mean_token_accuracy": 0.3751699924468994, + "num_tokens": 4145991257.0, + "step": 8110 + }, + { + "epoch": 2.1933477555435372, + "grad_norm": 80.5, + "learning_rate": 0.013204140767438708, + "loss": 11.7779, + "mean_token_accuracy": 0.0015620558988302946, + "num_tokens": 4146446772.0, + "step": 8111 + }, + { + "epoch": 2.193618171984857, + "grad_norm": 6.96875, + "learning_rate": 0.013202612252939118, + "loss": 3.6332, + "mean_token_accuracy": 0.3533128798007965, + "num_tokens": 4146971050.0, + "step": 8112 + }, + { + "epoch": 2.1938885884261765, + "grad_norm": 2.109375, + "learning_rate": 0.013201083670856905, + "loss": 3.1682, + "mean_token_accuracy": 0.3940994143486023, + "num_tokens": 4147495315.0, + "step": 8113 + }, + { + "epoch": 2.1941590048674957, + "grad_norm": 1.8671875, + "learning_rate": 0.01319955502123897, + "loss": 3.1246, + "mean_token_accuracy": 0.4107385277748108, + "num_tokens": 4147966162.0, + "step": 8114 + }, + { + "epoch": 2.194429421308816, + "grad_norm": 3.0625, + "learning_rate": 0.013198026304132224, + "loss": 3.4207, + "mean_token_accuracy": 0.38229620456695557, + "num_tokens": 4148469369.0, + "step": 8115 + }, + { + "epoch": 2.194699837750135, + "grad_norm": 3.03125, + "learning_rate": 0.013196497519583563, + "loss": 3.1907, + "mean_token_accuracy": 0.3898216187953949, + "num_tokens": 4148993627.0, + "step": 8116 + }, + { + "epoch": 2.1949702541914546, + "grad_norm": 3.28125, + "learning_rate": 0.0131949686676399, + "loss": 3.1386, + "mean_token_accuracy": 0.40287700295448303, + "num_tokens": 4149517768.0, + "step": 8117 + }, + { + "epoch": 2.1952406706327743, + "grad_norm": 3.390625, + "learning_rate": 0.013193439748348143, + "loss": 3.1852, + "mean_token_accuracy": 0.3760238289833069, + "num_tokens": 4150041992.0, + "step": 8118 + }, + { + "epoch": 2.195511087074094, + "grad_norm": 2.609375, + "learning_rate": 0.013191910761755203, + "loss": 3.1429, + "mean_token_accuracy": 0.389930784702301, + "num_tokens": 4150566223.0, + "step": 8119 + }, + { + "epoch": 2.1957815035154136, + "grad_norm": 3.1875, + "learning_rate": 0.013190381707907997, + "loss": 2.917, + "mean_token_accuracy": 0.40960848331451416, + "num_tokens": 4151090504.0, + "step": 8120 + }, + { + "epoch": 2.196051919956733, + "grad_norm": 3.09375, + "learning_rate": 0.013188852586853437, + "loss": 3.1981, + "mean_token_accuracy": 0.37062275409698486, + "num_tokens": 4151603833.0, + "step": 8121 + }, + { + "epoch": 2.196322336398053, + "grad_norm": 3.546875, + "learning_rate": 0.013187323398638444, + "loss": 3.3389, + "mean_token_accuracy": 0.3703831136226654, + "num_tokens": 4152128100.0, + "step": 8122 + }, + { + "epoch": 2.1965927528393725, + "grad_norm": 3.078125, + "learning_rate": 0.013185794143309934, + "loss": 3.3148, + "mean_token_accuracy": 0.3961184322834015, + "num_tokens": 4152597481.0, + "step": 8123 + }, + { + "epoch": 2.196863169280692, + "grad_norm": 3.046875, + "learning_rate": 0.013184264820914835, + "loss": 3.0224, + "mean_token_accuracy": 0.4105163514614105, + "num_tokens": 4153118997.0, + "step": 8124 + }, + { + "epoch": 2.197133585722012, + "grad_norm": 2.671875, + "learning_rate": 0.013182735431500066, + "loss": 3.1213, + "mean_token_accuracy": 0.40644338726997375, + "num_tokens": 4153643229.0, + "step": 8125 + }, + { + "epoch": 2.1974040021633314, + "grad_norm": 2.84375, + "learning_rate": 0.013181205975112556, + "loss": 3.05, + "mean_token_accuracy": 0.4011719226837158, + "num_tokens": 4154167337.0, + "step": 8126 + }, + { + "epoch": 2.197674418604651, + "grad_norm": 2.421875, + "learning_rate": 0.01317967645179923, + "loss": 3.146, + "mean_token_accuracy": 0.40610092878341675, + "num_tokens": 4154691587.0, + "step": 8127 + }, + { + "epoch": 2.1979448350459707, + "grad_norm": 3.625, + "learning_rate": 0.013178146861607022, + "loss": 3.3557, + "mean_token_accuracy": 0.37757501006126404, + "num_tokens": 4155215778.0, + "step": 8128 + }, + { + "epoch": 2.1982152514872904, + "grad_norm": 2.9375, + "learning_rate": 0.013176617204582864, + "loss": 3.2519, + "mean_token_accuracy": 0.39388519525527954, + "num_tokens": 4155695339.0, + "step": 8129 + }, + { + "epoch": 2.19848566792861, + "grad_norm": 3.359375, + "learning_rate": 0.013175087480773687, + "loss": 3.2015, + "mean_token_accuracy": 0.38477492332458496, + "num_tokens": 4156217686.0, + "step": 8130 + }, + { + "epoch": 2.1987560843699296, + "grad_norm": 7.6875, + "learning_rate": 0.013173557690226427, + "loss": 10.324, + "mean_token_accuracy": 0.00039443233981728554, + "num_tokens": 4156741913.0, + "step": 8131 + }, + { + "epoch": 2.1990265008112493, + "grad_norm": 7.3125, + "learning_rate": 0.013172027832988034, + "loss": 3.2908, + "mean_token_accuracy": 0.37787917256355286, + "num_tokens": 4157236372.0, + "step": 8132 + }, + { + "epoch": 2.199296917252569, + "grad_norm": 3.0, + "learning_rate": 0.013170497909105433, + "loss": 3.2442, + "mean_token_accuracy": 0.40183696150779724, + "num_tokens": 4157760541.0, + "step": 8133 + }, + { + "epoch": 2.1995673336938886, + "grad_norm": 2.453125, + "learning_rate": 0.013168967918625572, + "loss": 3.0067, + "mean_token_accuracy": 0.40279421210289, + "num_tokens": 4158284818.0, + "step": 8134 + }, + { + "epoch": 2.199837750135208, + "grad_norm": 2.84375, + "learning_rate": 0.0131674378615954, + "loss": 3.2985, + "mean_token_accuracy": 0.385780394077301, + "num_tokens": 4158808999.0, + "step": 8135 + }, + { + "epoch": 2.200108166576528, + "grad_norm": 3.015625, + "learning_rate": 0.013165907738061857, + "loss": 3.2196, + "mean_token_accuracy": 0.40019676089286804, + "num_tokens": 4159309346.0, + "step": 8136 + }, + { + "epoch": 2.2003785830178475, + "grad_norm": 2.25, + "learning_rate": 0.013164377548071898, + "loss": 3.1397, + "mean_token_accuracy": 0.3988310694694519, + "num_tokens": 4159795804.0, + "step": 8137 + }, + { + "epoch": 2.200648999459167, + "grad_norm": 3.125, + "learning_rate": 0.013162847291672473, + "loss": 3.1037, + "mean_token_accuracy": 0.3943685293197632, + "num_tokens": 4160319963.0, + "step": 8138 + }, + { + "epoch": 2.200919415900487, + "grad_norm": 4.28125, + "learning_rate": 0.013161316968910527, + "loss": 3.3262, + "mean_token_accuracy": 0.39245712757110596, + "num_tokens": 4160813714.0, + "step": 8139 + }, + { + "epoch": 2.2011898323418064, + "grad_norm": 3.125, + "learning_rate": 0.013159786579833023, + "loss": 2.8964, + "mean_token_accuracy": 0.4416564702987671, + "num_tokens": 4161337956.0, + "step": 8140 + }, + { + "epoch": 2.201460248783126, + "grad_norm": 3.015625, + "learning_rate": 0.013158256124486917, + "loss": 2.9107, + "mean_token_accuracy": 0.41936057806015015, + "num_tokens": 4161862126.0, + "step": 8141 + }, + { + "epoch": 2.2017306652244457, + "grad_norm": 3.46875, + "learning_rate": 0.013156725602919167, + "loss": 2.9888, + "mean_token_accuracy": 0.4063671827316284, + "num_tokens": 4162363353.0, + "step": 8142 + }, + { + "epoch": 2.2020010816657654, + "grad_norm": 2.890625, + "learning_rate": 0.013155195015176726, + "loss": 3.0819, + "mean_token_accuracy": 0.3900180459022522, + "num_tokens": 4162887514.0, + "step": 8143 + }, + { + "epoch": 2.202271498107085, + "grad_norm": 2.703125, + "learning_rate": 0.013153664361306567, + "loss": 3.1448, + "mean_token_accuracy": 0.39959245920181274, + "num_tokens": 4163411694.0, + "step": 8144 + }, + { + "epoch": 2.2025419145484046, + "grad_norm": 2.84375, + "learning_rate": 0.013152133641355654, + "loss": 3.086, + "mean_token_accuracy": 0.39172184467315674, + "num_tokens": 4163897153.0, + "step": 8145 + }, + { + "epoch": 2.2028123309897243, + "grad_norm": 3.0, + "learning_rate": 0.013150602855370952, + "loss": 3.3165, + "mean_token_accuracy": 0.38147544860839844, + "num_tokens": 4164421366.0, + "step": 8146 + }, + { + "epoch": 2.203082747431044, + "grad_norm": 3.140625, + "learning_rate": 0.01314907200339943, + "loss": 3.2936, + "mean_token_accuracy": 0.3886881470680237, + "num_tokens": 4164926904.0, + "step": 8147 + }, + { + "epoch": 2.2033531638723636, + "grad_norm": 2.90625, + "learning_rate": 0.013147541085488058, + "loss": 3.0566, + "mean_token_accuracy": 0.41763734817504883, + "num_tokens": 4165451127.0, + "step": 8148 + }, + { + "epoch": 2.203623580313683, + "grad_norm": 2.921875, + "learning_rate": 0.01314601010168381, + "loss": 3.3114, + "mean_token_accuracy": 0.36151331663131714, + "num_tokens": 4165975400.0, + "step": 8149 + }, + { + "epoch": 2.203893996755003, + "grad_norm": 53.5, + "learning_rate": 0.01314447905203366, + "loss": 3.521, + "mean_token_accuracy": 0.35778096318244934, + "num_tokens": 4166499535.0, + "step": 8150 + }, + { + "epoch": 2.2041644131963225, + "grad_norm": 45.0, + "learning_rate": 0.013142947936584588, + "loss": 12.3295, + "mean_token_accuracy": 0.006901225075125694, + "num_tokens": 4167002369.0, + "step": 8151 + }, + { + "epoch": 2.204434829637642, + "grad_norm": 10.0, + "learning_rate": 0.013141416755383568, + "loss": 3.9307, + "mean_token_accuracy": 0.32678842544555664, + "num_tokens": 4167526424.0, + "step": 8152 + }, + { + "epoch": 2.204705246078962, + "grad_norm": 3.203125, + "learning_rate": 0.013139885508477589, + "loss": 3.4, + "mean_token_accuracy": 0.367418497800827, + "num_tokens": 4168050639.0, + "step": 8153 + }, + { + "epoch": 2.2049756625202814, + "grad_norm": 2.5, + "learning_rate": 0.013138354195913629, + "loss": 3.277, + "mean_token_accuracy": 0.36381351947784424, + "num_tokens": 4168574854.0, + "step": 8154 + }, + { + "epoch": 2.2052460789616006, + "grad_norm": 3.625, + "learning_rate": 0.01313682281773867, + "loss": 3.1725, + "mean_token_accuracy": 0.3818289637565613, + "num_tokens": 4169099092.0, + "step": 8155 + }, + { + "epoch": 2.2055164954029207, + "grad_norm": 3.4375, + "learning_rate": 0.013135291373999707, + "loss": 3.2837, + "mean_token_accuracy": 0.38978493213653564, + "num_tokens": 4169582346.0, + "step": 8156 + }, + { + "epoch": 2.20578691184424, + "grad_norm": 3.125, + "learning_rate": 0.013133759864743721, + "loss": 3.3147, + "mean_token_accuracy": 0.40058326721191406, + "num_tokens": 4170044639.0, + "step": 8157 + }, + { + "epoch": 2.2060573282855596, + "grad_norm": 2.765625, + "learning_rate": 0.013132228290017709, + "loss": 3.2166, + "mean_token_accuracy": 0.3689621090888977, + "num_tokens": 4170568895.0, + "step": 8158 + }, + { + "epoch": 2.206327744726879, + "grad_norm": 2.375, + "learning_rate": 0.013130696649868663, + "loss": 3.1446, + "mean_token_accuracy": 0.40188512206077576, + "num_tokens": 4171065798.0, + "step": 8159 + }, + { + "epoch": 2.206598161168199, + "grad_norm": 2.78125, + "learning_rate": 0.01312916494434358, + "loss": 3.1673, + "mean_token_accuracy": 0.42963671684265137, + "num_tokens": 4171525888.0, + "step": 8160 + }, + { + "epoch": 2.2068685776095185, + "grad_norm": 3.3125, + "learning_rate": 0.013127633173489453, + "loss": 3.0859, + "mean_token_accuracy": 0.43453267216682434, + "num_tokens": 4172003068.0, + "step": 8161 + }, + { + "epoch": 2.207138994050838, + "grad_norm": 2.703125, + "learning_rate": 0.013126101337353285, + "loss": 3.0574, + "mean_token_accuracy": 0.40635448694229126, + "num_tokens": 4172527348.0, + "step": 8162 + }, + { + "epoch": 2.2074094104921578, + "grad_norm": 2.546875, + "learning_rate": 0.013124569435982074, + "loss": 3.2232, + "mean_token_accuracy": 0.38145333528518677, + "num_tokens": 4173013851.0, + "step": 8163 + }, + { + "epoch": 2.2076798269334774, + "grad_norm": 3.09375, + "learning_rate": 0.013123037469422824, + "loss": 2.943, + "mean_token_accuracy": 0.45224446058273315, + "num_tokens": 4173528361.0, + "step": 8164 + }, + { + "epoch": 2.207950243374797, + "grad_norm": 3.140625, + "learning_rate": 0.01312150543772254, + "loss": 3.2478, + "mean_token_accuracy": 0.4028111398220062, + "num_tokens": 4174050462.0, + "step": 8165 + }, + { + "epoch": 2.2082206598161167, + "grad_norm": 3.953125, + "learning_rate": 0.01311997334092823, + "loss": 3.3337, + "mean_token_accuracy": 0.36116933822631836, + "num_tokens": 4174574664.0, + "step": 8166 + }, + { + "epoch": 2.2084910762574363, + "grad_norm": 3.109375, + "learning_rate": 0.013118441179086908, + "loss": 3.2933, + "mean_token_accuracy": 0.4071810245513916, + "num_tokens": 4175098869.0, + "step": 8167 + }, + { + "epoch": 2.208761492698756, + "grad_norm": 2.84375, + "learning_rate": 0.01311690895224558, + "loss": 3.1197, + "mean_token_accuracy": 0.36926448345184326, + "num_tokens": 4175622996.0, + "step": 8168 + }, + { + "epoch": 2.2090319091400756, + "grad_norm": 3.234375, + "learning_rate": 0.013115376660451258, + "loss": 3.1864, + "mean_token_accuracy": 0.39958274364471436, + "num_tokens": 4176109483.0, + "step": 8169 + }, + { + "epoch": 2.2093023255813953, + "grad_norm": 2.84375, + "learning_rate": 0.013113844303750956, + "loss": 3.1569, + "mean_token_accuracy": 0.42381036281585693, + "num_tokens": 4176573093.0, + "step": 8170 + }, + { + "epoch": 2.209572742022715, + "grad_norm": 51.75, + "learning_rate": 0.013112311882191698, + "loss": 10.7516, + "mean_token_accuracy": 0.002991750370711088, + "num_tokens": 4177039331.0, + "step": 8171 + }, + { + "epoch": 2.2098431584640346, + "grad_norm": 7.625, + "learning_rate": 0.013110779395820497, + "loss": 3.7701, + "mean_token_accuracy": 0.33537060022354126, + "num_tokens": 4177563460.0, + "step": 8172 + }, + { + "epoch": 2.210113574905354, + "grad_norm": 2.484375, + "learning_rate": 0.013109246844684381, + "loss": 3.0342, + "mean_token_accuracy": 0.42250850796699524, + "num_tokens": 4178087621.0, + "step": 8173 + }, + { + "epoch": 2.210383991346674, + "grad_norm": 1.9765625, + "learning_rate": 0.013107714228830369, + "loss": 3.3413, + "mean_token_accuracy": 0.3969125747680664, + "num_tokens": 4178611845.0, + "step": 8174 + }, + { + "epoch": 2.2106544077879935, + "grad_norm": 3.40625, + "learning_rate": 0.013106181548305482, + "loss": 3.3014, + "mean_token_accuracy": 0.3702034652233124, + "num_tokens": 4179135929.0, + "step": 8175 + }, + { + "epoch": 2.210924824229313, + "grad_norm": 16.0, + "learning_rate": 0.013104648803156753, + "loss": 2.7885, + "mean_token_accuracy": 0.4442906379699707, + "num_tokens": 4179660069.0, + "step": 8176 + }, + { + "epoch": 2.2111952406706328, + "grad_norm": 3.25, + "learning_rate": 0.013103115993431209, + "loss": 3.2536, + "mean_token_accuracy": 0.3795751929283142, + "num_tokens": 4180184277.0, + "step": 8177 + }, + { + "epoch": 2.2114656571119524, + "grad_norm": 2.671875, + "learning_rate": 0.013101583119175881, + "loss": 3.0588, + "mean_token_accuracy": 0.3870236277580261, + "num_tokens": 4180708446.0, + "step": 8178 + }, + { + "epoch": 2.211736073553272, + "grad_norm": 2.75, + "learning_rate": 0.013100050180437804, + "loss": 3.3562, + "mean_token_accuracy": 0.3758839964866638, + "num_tokens": 4181232731.0, + "step": 8179 + }, + { + "epoch": 2.2120064899945917, + "grad_norm": 3.15625, + "learning_rate": 0.013098517177264011, + "loss": 3.3818, + "mean_token_accuracy": 0.36621326208114624, + "num_tokens": 4181742816.0, + "step": 8180 + }, + { + "epoch": 2.2122769064359114, + "grad_norm": 2.671875, + "learning_rate": 0.01309698410970154, + "loss": 3.1115, + "mean_token_accuracy": 0.4091279208660126, + "num_tokens": 4182188562.0, + "step": 8181 + }, + { + "epoch": 2.212547322877231, + "grad_norm": 2.734375, + "learning_rate": 0.01309545097779743, + "loss": 3.1776, + "mean_token_accuracy": 0.3976914584636688, + "num_tokens": 4182676819.0, + "step": 8182 + }, + { + "epoch": 2.2128177393185506, + "grad_norm": 2.75, + "learning_rate": 0.013093917781598722, + "loss": 3.223, + "mean_token_accuracy": 0.39221620559692383, + "num_tokens": 4183201099.0, + "step": 8183 + }, + { + "epoch": 2.2130881557598703, + "grad_norm": 2.546875, + "learning_rate": 0.013092384521152457, + "loss": 3.198, + "mean_token_accuracy": 0.38909655809402466, + "num_tokens": 4183701202.0, + "step": 8184 + }, + { + "epoch": 2.21335857220119, + "grad_norm": 2.515625, + "learning_rate": 0.013090851196505684, + "loss": 3.1693, + "mean_token_accuracy": 0.42084354162216187, + "num_tokens": 4184162941.0, + "step": 8185 + }, + { + "epoch": 2.2136289886425096, + "grad_norm": 2.078125, + "learning_rate": 0.013089317807705444, + "loss": 2.991, + "mean_token_accuracy": 0.3980756998062134, + "num_tokens": 4184687181.0, + "step": 8186 + }, + { + "epoch": 2.213899405083829, + "grad_norm": 2.8125, + "learning_rate": 0.013087784354798791, + "loss": 2.9033, + "mean_token_accuracy": 0.4111783504486084, + "num_tokens": 4185211434.0, + "step": 8187 + }, + { + "epoch": 2.214169821525149, + "grad_norm": 3.578125, + "learning_rate": 0.013086250837832774, + "loss": 3.3363, + "mean_token_accuracy": 0.4084652364253998, + "num_tokens": 4185672023.0, + "step": 8188 + }, + { + "epoch": 2.2144402379664685, + "grad_norm": 3.484375, + "learning_rate": 0.013084717256854447, + "loss": 3.3002, + "mean_token_accuracy": 0.39134281873703003, + "num_tokens": 4186196122.0, + "step": 8189 + }, + { + "epoch": 2.214710654407788, + "grad_norm": 3.140625, + "learning_rate": 0.01308318361191086, + "loss": 3.2042, + "mean_token_accuracy": 0.39360326528549194, + "num_tokens": 4186720369.0, + "step": 8190 + }, + { + "epoch": 2.214981070849108, + "grad_norm": 47.5, + "learning_rate": 0.013081649903049082, + "loss": 15.9006, + "mean_token_accuracy": 0.0, + "num_tokens": 4187244634.0, + "step": 8191 + }, + { + "epoch": 2.2152514872904274, + "grad_norm": 8.125, + "learning_rate": 0.013080116130316155, + "loss": 3.3651, + "mean_token_accuracy": 0.392577588558197, + "num_tokens": 4187706201.0, + "step": 8192 + }, + { + "epoch": 2.215521903731747, + "grad_norm": 2.0625, + "learning_rate": 0.013078582293759149, + "loss": 3.1159, + "mean_token_accuracy": 0.3971728980541229, + "num_tokens": 4188230337.0, + "step": 8193 + }, + { + "epoch": 2.2157923201730667, + "grad_norm": 2.703125, + "learning_rate": 0.013077048393425128, + "loss": 3.3791, + "mean_token_accuracy": 0.37748217582702637, + "num_tokens": 4188754590.0, + "step": 8194 + }, + { + "epoch": 2.2160627366143864, + "grad_norm": 4.34375, + "learning_rate": 0.013075514429361154, + "loss": 3.3194, + "mean_token_accuracy": 0.38345885276794434, + "num_tokens": 4189278820.0, + "step": 8195 + }, + { + "epoch": 2.2163331530557056, + "grad_norm": 2.625, + "learning_rate": 0.013073980401614295, + "loss": 3.2575, + "mean_token_accuracy": 0.4038914442062378, + "num_tokens": 4189777567.0, + "step": 8196 + }, + { + "epoch": 2.2166035694970256, + "grad_norm": 4.09375, + "learning_rate": 0.013072446310231618, + "loss": 3.3833, + "mean_token_accuracy": 0.3756575584411621, + "num_tokens": 4190301796.0, + "step": 8197 + }, + { + "epoch": 2.216873985938345, + "grad_norm": 2.28125, + "learning_rate": 0.01307091215526019, + "loss": 3.1843, + "mean_token_accuracy": 0.3927305340766907, + "num_tokens": 4190825900.0, + "step": 8198 + }, + { + "epoch": 2.2171444023796645, + "grad_norm": 2.421875, + "learning_rate": 0.013069377936747086, + "loss": 3.2996, + "mean_token_accuracy": 0.4090683162212372, + "num_tokens": 4191285958.0, + "step": 8199 + }, + { + "epoch": 2.217414818820984, + "grad_norm": 2.421875, + "learning_rate": 0.013067843654739383, + "loss": 3.0697, + "mean_token_accuracy": 0.40159061551094055, + "num_tokens": 4191778091.0, + "step": 8200 + }, + { + "epoch": 2.2176852352623038, + "grad_norm": 2.734375, + "learning_rate": 0.013066309309284159, + "loss": 3.1959, + "mean_token_accuracy": 0.4005737900733948, + "num_tokens": 4192302066.0, + "step": 8201 + }, + { + "epoch": 2.2179556517036234, + "grad_norm": 3.546875, + "learning_rate": 0.013064774900428485, + "loss": 3.3243, + "mean_token_accuracy": 0.36648428440093994, + "num_tokens": 4192826315.0, + "step": 8202 + }, + { + "epoch": 2.218226068144943, + "grad_norm": 2.703125, + "learning_rate": 0.013063240428219445, + "loss": 3.1998, + "mean_token_accuracy": 0.40432071685791016, + "num_tokens": 4193348914.0, + "step": 8203 + }, + { + "epoch": 2.2184964845862627, + "grad_norm": 2.9375, + "learning_rate": 0.013061705892704126, + "loss": 3.2561, + "mean_token_accuracy": 0.3975895047187805, + "num_tokens": 4193812507.0, + "step": 8204 + }, + { + "epoch": 2.2187669010275823, + "grad_norm": 2.828125, + "learning_rate": 0.013060171293929604, + "loss": 3.2261, + "mean_token_accuracy": 0.38452059030532837, + "num_tokens": 4194336525.0, + "step": 8205 + }, + { + "epoch": 2.219037317468902, + "grad_norm": 2.671875, + "learning_rate": 0.013058636631942963, + "loss": 3.2209, + "mean_token_accuracy": 0.383553683757782, + "num_tokens": 4194860717.0, + "step": 8206 + }, + { + "epoch": 2.2193077339102216, + "grad_norm": 2.890625, + "learning_rate": 0.013057101906791302, + "loss": 3.1827, + "mean_token_accuracy": 0.41249334812164307, + "num_tokens": 4195347823.0, + "step": 8207 + }, + { + "epoch": 2.2195781503515413, + "grad_norm": 2.546875, + "learning_rate": 0.013055567118521703, + "loss": 3.2061, + "mean_token_accuracy": 0.38472896814346313, + "num_tokens": 4195871977.0, + "step": 8208 + }, + { + "epoch": 2.219848566792861, + "grad_norm": 2.234375, + "learning_rate": 0.01305403226718126, + "loss": 3.074, + "mean_token_accuracy": 0.4171339273452759, + "num_tokens": 4196366981.0, + "step": 8209 + }, + { + "epoch": 2.2201189832341806, + "grad_norm": 3.203125, + "learning_rate": 0.013052497352817067, + "loss": 2.9872, + "mean_token_accuracy": 0.39949291944503784, + "num_tokens": 4196891203.0, + "step": 8210 + }, + { + "epoch": 2.2203893996755, + "grad_norm": 104.5, + "learning_rate": 0.01305096237547622, + "loss": 16.6663, + "mean_token_accuracy": 0.0, + "num_tokens": 4197415460.0, + "step": 8211 + }, + { + "epoch": 2.22065981611682, + "grad_norm": 5.25, + "learning_rate": 0.013049427335205817, + "loss": 3.5596, + "mean_token_accuracy": 0.37119245529174805, + "num_tokens": 4197939498.0, + "step": 8212 + }, + { + "epoch": 2.2209302325581395, + "grad_norm": 1.8828125, + "learning_rate": 0.013047892232052953, + "loss": 3.2829, + "mean_token_accuracy": 0.4129514694213867, + "num_tokens": 4198404052.0, + "step": 8213 + }, + { + "epoch": 2.221200648999459, + "grad_norm": 2.84375, + "learning_rate": 0.013046357066064733, + "loss": 3.1595, + "mean_token_accuracy": 0.37857291102409363, + "num_tokens": 4198928264.0, + "step": 8214 + }, + { + "epoch": 2.2214710654407788, + "grad_norm": 2.90625, + "learning_rate": 0.013044821837288261, + "loss": 3.1975, + "mean_token_accuracy": 0.3879365622997284, + "num_tokens": 4199417354.0, + "step": 8215 + }, + { + "epoch": 2.2217414818820984, + "grad_norm": 2.8125, + "learning_rate": 0.013043286545770643, + "loss": 3.2463, + "mean_token_accuracy": 0.3719722032546997, + "num_tokens": 4199891156.0, + "step": 8216 + }, + { + "epoch": 2.222011898323418, + "grad_norm": 2.1875, + "learning_rate": 0.013041751191558983, + "loss": 2.9386, + "mean_token_accuracy": 0.4238576889038086, + "num_tokens": 4200384388.0, + "step": 8217 + }, + { + "epoch": 2.2222823147647377, + "grad_norm": 2.78125, + "learning_rate": 0.013040215774700395, + "loss": 3.1071, + "mean_token_accuracy": 0.37848976254463196, + "num_tokens": 4200908575.0, + "step": 8218 + }, + { + "epoch": 2.2225527312060573, + "grad_norm": 2.6875, + "learning_rate": 0.013038680295241982, + "loss": 2.9472, + "mean_token_accuracy": 0.43200355768203735, + "num_tokens": 4201393757.0, + "step": 8219 + }, + { + "epoch": 2.222823147647377, + "grad_norm": 2.671875, + "learning_rate": 0.013037144753230869, + "loss": 2.9959, + "mean_token_accuracy": 0.41515448689460754, + "num_tokens": 4201917981.0, + "step": 8220 + }, + { + "epoch": 2.2230935640886966, + "grad_norm": 2.921875, + "learning_rate": 0.013035609148714156, + "loss": 3.3565, + "mean_token_accuracy": 0.403582900762558, + "num_tokens": 4202442177.0, + "step": 8221 + }, + { + "epoch": 2.2233639805300163, + "grad_norm": 2.8125, + "learning_rate": 0.013034073481738974, + "loss": 3.0055, + "mean_token_accuracy": 0.43030640482902527, + "num_tokens": 4202903407.0, + "step": 8222 + }, + { + "epoch": 2.223634396971336, + "grad_norm": 2.6875, + "learning_rate": 0.013032537752352434, + "loss": 3.1647, + "mean_token_accuracy": 0.42028212547302246, + "num_tokens": 4203384625.0, + "step": 8223 + }, + { + "epoch": 2.2239048134126556, + "grad_norm": 3.40625, + "learning_rate": 0.013031001960601658, + "loss": 3.1504, + "mean_token_accuracy": 0.36616069078445435, + "num_tokens": 4203908709.0, + "step": 8224 + }, + { + "epoch": 2.224175229853975, + "grad_norm": 2.328125, + "learning_rate": 0.013029466106533771, + "loss": 3.1014, + "mean_token_accuracy": 0.40436455607414246, + "num_tokens": 4204407619.0, + "step": 8225 + }, + { + "epoch": 2.224445646295295, + "grad_norm": 2.84375, + "learning_rate": 0.013027930190195894, + "loss": 3.0984, + "mean_token_accuracy": 0.38727572560310364, + "num_tokens": 4204931847.0, + "step": 8226 + }, + { + "epoch": 2.2247160627366145, + "grad_norm": 2.4375, + "learning_rate": 0.013026394211635157, + "loss": 3.0234, + "mean_token_accuracy": 0.4017281234264374, + "num_tokens": 4205456063.0, + "step": 8227 + }, + { + "epoch": 2.224986479177934, + "grad_norm": 2.59375, + "learning_rate": 0.013024858170898681, + "loss": 2.9962, + "mean_token_accuracy": 0.4038270115852356, + "num_tokens": 4205980330.0, + "step": 8228 + }, + { + "epoch": 2.2252568956192538, + "grad_norm": 2.546875, + "learning_rate": 0.013023322068033607, + "loss": 3.0262, + "mean_token_accuracy": 0.3853406608104706, + "num_tokens": 4206504611.0, + "step": 8229 + }, + { + "epoch": 2.2255273120605734, + "grad_norm": 2.59375, + "learning_rate": 0.01302178590308706, + "loss": 3.1073, + "mean_token_accuracy": 0.3949428200721741, + "num_tokens": 4207028806.0, + "step": 8230 + }, + { + "epoch": 2.225797728501893, + "grad_norm": 14.0625, + "learning_rate": 0.013020249676106176, + "loss": 12.0116, + "mean_token_accuracy": 0.0, + "num_tokens": 4207534404.0, + "step": 8231 + }, + { + "epoch": 2.2260681449432127, + "grad_norm": 7.84375, + "learning_rate": 0.01301871338713809, + "loss": 3.6171, + "mean_token_accuracy": 0.3092174232006073, + "num_tokens": 4208058672.0, + "step": 8232 + }, + { + "epoch": 2.2263385613845323, + "grad_norm": 2.109375, + "learning_rate": 0.01301717703622994, + "loss": 3.2822, + "mean_token_accuracy": 0.3869543969631195, + "num_tokens": 4208579894.0, + "step": 8233 + }, + { + "epoch": 2.226608977825852, + "grad_norm": 2.546875, + "learning_rate": 0.013015640623428871, + "loss": 3.2325, + "mean_token_accuracy": 0.3887206017971039, + "num_tokens": 4209104173.0, + "step": 8234 + }, + { + "epoch": 2.2268793942671716, + "grad_norm": 3.46875, + "learning_rate": 0.013014104148782015, + "loss": 3.4752, + "mean_token_accuracy": 0.3673131465911865, + "num_tokens": 4209628365.0, + "step": 8235 + }, + { + "epoch": 2.2271498107084913, + "grad_norm": 3.546875, + "learning_rate": 0.013012567612336521, + "loss": 3.036, + "mean_token_accuracy": 0.4020254611968994, + "num_tokens": 4210152563.0, + "step": 8236 + }, + { + "epoch": 2.227420227149811, + "grad_norm": 3.4375, + "learning_rate": 0.013011031014139536, + "loss": 3.1964, + "mean_token_accuracy": 0.4147612750530243, + "num_tokens": 4210647647.0, + "step": 8237 + }, + { + "epoch": 2.2276906435911306, + "grad_norm": 3.0625, + "learning_rate": 0.013009494354238203, + "loss": 3.132, + "mean_token_accuracy": 0.3652728199958801, + "num_tokens": 4211171866.0, + "step": 8238 + }, + { + "epoch": 2.2279610600324498, + "grad_norm": 3.125, + "learning_rate": 0.013007957632679675, + "loss": 3.2205, + "mean_token_accuracy": 0.38713452219963074, + "num_tokens": 4211643076.0, + "step": 8239 + }, + { + "epoch": 2.2282314764737694, + "grad_norm": 2.953125, + "learning_rate": 0.013006420849511105, + "loss": 3.164, + "mean_token_accuracy": 0.3811541795730591, + "num_tokens": 4212167346.0, + "step": 8240 + }, + { + "epoch": 2.228501892915089, + "grad_norm": 3.03125, + "learning_rate": 0.013004884004779638, + "loss": 3.1434, + "mean_token_accuracy": 0.3962136209011078, + "num_tokens": 4212691588.0, + "step": 8241 + }, + { + "epoch": 2.2287723093564087, + "grad_norm": 3.453125, + "learning_rate": 0.013003347098532436, + "loss": 3.289, + "mean_token_accuracy": 0.3917643129825592, + "num_tokens": 4213214843.0, + "step": 8242 + }, + { + "epoch": 2.2290427257977283, + "grad_norm": 3.546875, + "learning_rate": 0.013001810130816653, + "loss": 3.0858, + "mean_token_accuracy": 0.4121280312538147, + "num_tokens": 4213645914.0, + "step": 8243 + }, + { + "epoch": 2.229313142239048, + "grad_norm": 2.8125, + "learning_rate": 0.013000273101679448, + "loss": 3.0873, + "mean_token_accuracy": 0.38892894983291626, + "num_tokens": 4214169966.0, + "step": 8244 + }, + { + "epoch": 2.2295835586803676, + "grad_norm": 2.96875, + "learning_rate": 0.012998736011167982, + "loss": 3.2202, + "mean_token_accuracy": 0.3864603340625763, + "num_tokens": 4214694245.0, + "step": 8245 + }, + { + "epoch": 2.2298539751216873, + "grad_norm": 4.125, + "learning_rate": 0.01299719885932942, + "loss": 3.3014, + "mean_token_accuracy": 0.39455193281173706, + "num_tokens": 4215177119.0, + "step": 8246 + }, + { + "epoch": 2.230124391563007, + "grad_norm": 2.484375, + "learning_rate": 0.012995661646210922, + "loss": 2.8741, + "mean_token_accuracy": 0.4092153310775757, + "num_tokens": 4215701369.0, + "step": 8247 + }, + { + "epoch": 2.2303948080043265, + "grad_norm": 2.984375, + "learning_rate": 0.012994124371859652, + "loss": 3.2103, + "mean_token_accuracy": 0.3861218988895416, + "num_tokens": 4216174664.0, + "step": 8248 + }, + { + "epoch": 2.230665224445646, + "grad_norm": 17.375, + "learning_rate": 0.012992587036322787, + "loss": 2.9222, + "mean_token_accuracy": 0.4568628668785095, + "num_tokens": 4216698876.0, + "step": 8249 + }, + { + "epoch": 2.230935640886966, + "grad_norm": 3.046875, + "learning_rate": 0.012991049639647487, + "loss": 3.2325, + "mean_token_accuracy": 0.3877304792404175, + "num_tokens": 4217223057.0, + "step": 8250 + }, + { + "epoch": 2.2312060573282855, + "grad_norm": 3.984375, + "learning_rate": 0.012989512181880934, + "loss": 10.9695, + "mean_token_accuracy": 1.6571444575674832e-05, + "num_tokens": 4217747193.0, + "step": 8251 + }, + { + "epoch": 2.231476473769605, + "grad_norm": 5.25, + "learning_rate": 0.012987974663070294, + "loss": 3.7872, + "mean_token_accuracy": 0.3310632109642029, + "num_tokens": 4218271463.0, + "step": 8252 + }, + { + "epoch": 2.2317468902109248, + "grad_norm": 2.640625, + "learning_rate": 0.012986437083262746, + "loss": 2.992, + "mean_token_accuracy": 0.4242134392261505, + "num_tokens": 4218731984.0, + "step": 8253 + }, + { + "epoch": 2.2320173066522444, + "grad_norm": 2.125, + "learning_rate": 0.012984899442505464, + "loss": 3.3767, + "mean_token_accuracy": 0.3807605504989624, + "num_tokens": 4219256268.0, + "step": 8254 + }, + { + "epoch": 2.232287723093564, + "grad_norm": 3.296875, + "learning_rate": 0.012983361740845634, + "loss": 3.3866, + "mean_token_accuracy": 0.3757803440093994, + "num_tokens": 4219780532.0, + "step": 8255 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 2.8125, + "learning_rate": 0.012981823978330432, + "loss": 3.3861, + "mean_token_accuracy": 0.3778300881385803, + "num_tokens": 4220304753.0, + "step": 8256 + }, + { + "epoch": 2.2328285559762033, + "grad_norm": 3.3125, + "learning_rate": 0.012980286155007037, + "loss": 3.1541, + "mean_token_accuracy": 0.41755592823028564, + "num_tokens": 4220829015.0, + "step": 8257 + }, + { + "epoch": 2.233098972417523, + "grad_norm": 3.546875, + "learning_rate": 0.012978748270922644, + "loss": 3.2903, + "mean_token_accuracy": 0.3865484595298767, + "num_tokens": 4221261853.0, + "step": 8258 + }, + { + "epoch": 2.2333693888588426, + "grad_norm": 3.765625, + "learning_rate": 0.012977210326124436, + "loss": 3.4586, + "mean_token_accuracy": 0.35249021649360657, + "num_tokens": 4221786031.0, + "step": 8259 + }, + { + "epoch": 2.2336398053001623, + "grad_norm": 2.515625, + "learning_rate": 0.012975672320659597, + "loss": 3.0558, + "mean_token_accuracy": 0.3969706594944, + "num_tokens": 4222310200.0, + "step": 8260 + }, + { + "epoch": 2.233910221741482, + "grad_norm": 3.421875, + "learning_rate": 0.012974134254575323, + "loss": 3.3287, + "mean_token_accuracy": 0.39392662048339844, + "num_tokens": 4222818939.0, + "step": 8261 + }, + { + "epoch": 2.2341806381828015, + "grad_norm": 3.015625, + "learning_rate": 0.012972596127918805, + "loss": 3.1429, + "mean_token_accuracy": 0.3781735599040985, + "num_tokens": 4223306253.0, + "step": 8262 + }, + { + "epoch": 2.234451054624121, + "grad_norm": 2.8125, + "learning_rate": 0.012971057940737234, + "loss": 3.2468, + "mean_token_accuracy": 0.38281798362731934, + "num_tokens": 4223830406.0, + "step": 8263 + }, + { + "epoch": 2.234721471065441, + "grad_norm": 11.75, + "learning_rate": 0.012969519693077811, + "loss": 3.2282, + "mean_token_accuracy": 0.3896741569042206, + "num_tokens": 4224354627.0, + "step": 8264 + }, + { + "epoch": 2.2349918875067605, + "grad_norm": 2.375, + "learning_rate": 0.012967981384987733, + "loss": 3.1646, + "mean_token_accuracy": 0.38277295231819153, + "num_tokens": 4224878894.0, + "step": 8265 + }, + { + "epoch": 2.23526230394808, + "grad_norm": 3.84375, + "learning_rate": 0.0129664430165142, + "loss": 3.3546, + "mean_token_accuracy": 0.3704906702041626, + "num_tokens": 4225401995.0, + "step": 8266 + }, + { + "epoch": 2.2355327203893998, + "grad_norm": 3.390625, + "learning_rate": 0.012964904587704407, + "loss": 3.2267, + "mean_token_accuracy": 0.4076686501502991, + "num_tokens": 4225891369.0, + "step": 8267 + }, + { + "epoch": 2.2358031368307194, + "grad_norm": 2.6875, + "learning_rate": 0.012963366098605565, + "loss": 3.1769, + "mean_token_accuracy": 0.3852105140686035, + "num_tokens": 4226397050.0, + "step": 8268 + }, + { + "epoch": 2.236073553272039, + "grad_norm": 3.078125, + "learning_rate": 0.012961827549264879, + "loss": 3.2839, + "mean_token_accuracy": 0.38705793023109436, + "num_tokens": 4226921317.0, + "step": 8269 + }, + { + "epoch": 2.2363439697133587, + "grad_norm": 2.4375, + "learning_rate": 0.012960288939729555, + "loss": 2.8815, + "mean_token_accuracy": 0.4058331251144409, + "num_tokens": 4227445570.0, + "step": 8270 + }, + { + "epoch": 2.2366143861546783, + "grad_norm": 37.0, + "learning_rate": 0.012958750270046797, + "loss": 13.0657, + "mean_token_accuracy": 0.016357630491256714, + "num_tokens": 4227969848.0, + "step": 8271 + }, + { + "epoch": 2.236884802595998, + "grad_norm": 6.3125, + "learning_rate": 0.012957211540263823, + "loss": 3.7337, + "mean_token_accuracy": 0.36937543749809265, + "num_tokens": 4228494124.0, + "step": 8272 + }, + { + "epoch": 2.2371552190373176, + "grad_norm": 2.21875, + "learning_rate": 0.012955672750427845, + "loss": 3.3822, + "mean_token_accuracy": 0.3749486207962036, + "num_tokens": 4229009413.0, + "step": 8273 + }, + { + "epoch": 2.2374256354786373, + "grad_norm": 2.90625, + "learning_rate": 0.012954133900586074, + "loss": 3.1607, + "mean_token_accuracy": 0.39298152923583984, + "num_tokens": 4229525278.0, + "step": 8274 + }, + { + "epoch": 2.237696051919957, + "grad_norm": 2.890625, + "learning_rate": 0.012952594990785727, + "loss": 3.1479, + "mean_token_accuracy": 0.3933361768722534, + "num_tokens": 4230049553.0, + "step": 8275 + }, + { + "epoch": 2.2379664683612766, + "grad_norm": 3.640625, + "learning_rate": 0.012951056021074029, + "loss": 3.0902, + "mean_token_accuracy": 0.38826626539230347, + "num_tokens": 4230573713.0, + "step": 8276 + }, + { + "epoch": 2.238236884802596, + "grad_norm": 2.421875, + "learning_rate": 0.012949516991498189, + "loss": 2.981, + "mean_token_accuracy": 0.3860965371131897, + "num_tokens": 4231097940.0, + "step": 8277 + }, + { + "epoch": 2.238507301243916, + "grad_norm": 2.765625, + "learning_rate": 0.012947977902105431, + "loss": 3.2642, + "mean_token_accuracy": 0.3812848925590515, + "num_tokens": 4231622158.0, + "step": 8278 + }, + { + "epoch": 2.2387777176852355, + "grad_norm": 2.53125, + "learning_rate": 0.01294643875294299, + "loss": 3.1412, + "mean_token_accuracy": 0.3972821533679962, + "num_tokens": 4232146427.0, + "step": 8279 + }, + { + "epoch": 2.2390481341265547, + "grad_norm": 2.953125, + "learning_rate": 0.012944899544058083, + "loss": 3.2561, + "mean_token_accuracy": 0.39310911297798157, + "num_tokens": 4232670610.0, + "step": 8280 + }, + { + "epoch": 2.2393185505678743, + "grad_norm": 2.703125, + "learning_rate": 0.012943360275497934, + "loss": 3.1238, + "mean_token_accuracy": 0.3896051049232483, + "num_tokens": 4233194814.0, + "step": 8281 + }, + { + "epoch": 2.239588967009194, + "grad_norm": 3.28125, + "learning_rate": 0.012941820947309779, + "loss": 2.982, + "mean_token_accuracy": 0.40235576033592224, + "num_tokens": 4233682366.0, + "step": 8282 + }, + { + "epoch": 2.2398593834505136, + "grad_norm": 2.3125, + "learning_rate": 0.012940281559540849, + "loss": 2.9875, + "mean_token_accuracy": 0.40271425247192383, + "num_tokens": 4234206636.0, + "step": 8283 + }, + { + "epoch": 2.2401297998918333, + "grad_norm": 3.6875, + "learning_rate": 0.012938742112238373, + "loss": 3.1235, + "mean_token_accuracy": 0.3850928544998169, + "num_tokens": 4234730913.0, + "step": 8284 + }, + { + "epoch": 2.240400216333153, + "grad_norm": 3.609375, + "learning_rate": 0.012937202605449588, + "loss": 3.1918, + "mean_token_accuracy": 0.383539080619812, + "num_tokens": 4235254998.0, + "step": 8285 + }, + { + "epoch": 2.2406706327744725, + "grad_norm": 3.203125, + "learning_rate": 0.012935663039221728, + "loss": 3.0448, + "mean_token_accuracy": 0.41144803166389465, + "num_tokens": 4235742636.0, + "step": 8286 + }, + { + "epoch": 2.240941049215792, + "grad_norm": 2.40625, + "learning_rate": 0.012934123413602035, + "loss": 3.2108, + "mean_token_accuracy": 0.4123125374317169, + "num_tokens": 4236224008.0, + "step": 8287 + }, + { + "epoch": 2.241211465657112, + "grad_norm": 2.390625, + "learning_rate": 0.012932583728637748, + "loss": 3.182, + "mean_token_accuracy": 0.3917047381401062, + "num_tokens": 4236748184.0, + "step": 8288 + }, + { + "epoch": 2.2414818820984315, + "grad_norm": 2.671875, + "learning_rate": 0.01293104398437611, + "loss": 3.0155, + "mean_token_accuracy": 0.43688997626304626, + "num_tokens": 4237272397.0, + "step": 8289 + }, + { + "epoch": 2.241752298539751, + "grad_norm": 2.40625, + "learning_rate": 0.012929504180864361, + "loss": 3.0455, + "mean_token_accuracy": 0.4084007740020752, + "num_tokens": 4237727829.0, + "step": 8290 + }, + { + "epoch": 2.2420227149810708, + "grad_norm": 37.0, + "learning_rate": 0.01292796431814975, + "loss": 11.5881, + "mean_token_accuracy": 0.003015092806890607, + "num_tokens": 4238252001.0, + "step": 8291 + }, + { + "epoch": 2.2422931314223904, + "grad_norm": 8.1875, + "learning_rate": 0.012926424396279525, + "loss": 3.501, + "mean_token_accuracy": 0.3354962468147278, + "num_tokens": 4238776242.0, + "step": 8292 + }, + { + "epoch": 2.24256354786371, + "grad_norm": 2.4375, + "learning_rate": 0.012924884415300932, + "loss": 3.2111, + "mean_token_accuracy": 0.3902304768562317, + "num_tokens": 4239300430.0, + "step": 8293 + }, + { + "epoch": 2.2428339643050297, + "grad_norm": 2.78125, + "learning_rate": 0.012923344375261228, + "loss": 3.4198, + "mean_token_accuracy": 0.36710578203201294, + "num_tokens": 4239824616.0, + "step": 8294 + }, + { + "epoch": 2.2431043807463493, + "grad_norm": 4.28125, + "learning_rate": 0.012921804276207661, + "loss": 3.3339, + "mean_token_accuracy": 0.40981829166412354, + "num_tokens": 4240348780.0, + "step": 8295 + }, + { + "epoch": 2.243374797187669, + "grad_norm": 2.890625, + "learning_rate": 0.01292026411818749, + "loss": 3.2072, + "mean_token_accuracy": 0.388983815908432, + "num_tokens": 4240872922.0, + "step": 8296 + }, + { + "epoch": 2.2436452136289886, + "grad_norm": 3.03125, + "learning_rate": 0.012918723901247963, + "loss": 3.2898, + "mean_token_accuracy": 0.38374483585357666, + "num_tokens": 4241397139.0, + "step": 8297 + }, + { + "epoch": 2.2439156300703083, + "grad_norm": 2.5625, + "learning_rate": 0.012917183625436346, + "loss": 3.1642, + "mean_token_accuracy": 0.3906000554561615, + "num_tokens": 4241921423.0, + "step": 8298 + }, + { + "epoch": 2.244186046511628, + "grad_norm": 2.71875, + "learning_rate": 0.012915643290799899, + "loss": 3.1216, + "mean_token_accuracy": 0.399539977312088, + "num_tokens": 4242392696.0, + "step": 8299 + }, + { + "epoch": 2.2444564629529475, + "grad_norm": 3.53125, + "learning_rate": 0.012914102897385883, + "loss": 2.8457, + "mean_token_accuracy": 0.4217350482940674, + "num_tokens": 4242855879.0, + "step": 8300 + }, + { + "epoch": 2.244726879394267, + "grad_norm": 2.25, + "learning_rate": 0.012912562445241558, + "loss": 3.1354, + "mean_token_accuracy": 0.40268412232398987, + "num_tokens": 4243380109.0, + "step": 8301 + }, + { + "epoch": 2.244997295835587, + "grad_norm": 2.703125, + "learning_rate": 0.012911021934414195, + "loss": 2.9752, + "mean_token_accuracy": 0.4057832360267639, + "num_tokens": 4243885640.0, + "step": 8302 + }, + { + "epoch": 2.2452677122769065, + "grad_norm": 2.5625, + "learning_rate": 0.012909481364951056, + "loss": 2.9857, + "mean_token_accuracy": 0.4144440293312073, + "num_tokens": 4244409862.0, + "step": 8303 + }, + { + "epoch": 2.245538128718226, + "grad_norm": 2.390625, + "learning_rate": 0.012907940736899418, + "loss": 3.1554, + "mean_token_accuracy": 0.41496291756629944, + "num_tokens": 4244933983.0, + "step": 8304 + }, + { + "epoch": 2.2458085451595458, + "grad_norm": 3.921875, + "learning_rate": 0.012906400050306544, + "loss": 3.1452, + "mean_token_accuracy": 0.40923768281936646, + "num_tokens": 4245424987.0, + "step": 8305 + }, + { + "epoch": 2.2460789616008654, + "grad_norm": 3.015625, + "learning_rate": 0.012904859305219712, + "loss": 3.2952, + "mean_token_accuracy": 0.4065150320529938, + "num_tokens": 4245949223.0, + "step": 8306 + }, + { + "epoch": 2.246349378042185, + "grad_norm": 3.546875, + "learning_rate": 0.012903318501686191, + "loss": 3.16, + "mean_token_accuracy": 0.3687146306037903, + "num_tokens": 4246473472.0, + "step": 8307 + }, + { + "epoch": 2.2466197944835047, + "grad_norm": 2.40625, + "learning_rate": 0.012901777639753266, + "loss": 2.9675, + "mean_token_accuracy": 0.4234049320220947, + "num_tokens": 4246959493.0, + "step": 8308 + }, + { + "epoch": 2.2468902109248243, + "grad_norm": 3.09375, + "learning_rate": 0.012900236719468208, + "loss": 3.0858, + "mean_token_accuracy": 0.39098986983299255, + "num_tokens": 4247457244.0, + "step": 8309 + }, + { + "epoch": 2.247160627366144, + "grad_norm": 2.265625, + "learning_rate": 0.0128986957408783, + "loss": 3.1364, + "mean_token_accuracy": 0.398298978805542, + "num_tokens": 4247981407.0, + "step": 8310 + }, + { + "epoch": 2.2474310438074636, + "grad_norm": 4.59375, + "learning_rate": 0.012897154704030822, + "loss": 11.4976, + "mean_token_accuracy": 0.0002515866362955421, + "num_tokens": 4248470964.0, + "step": 8311 + }, + { + "epoch": 2.2477014602487833, + "grad_norm": 8.0625, + "learning_rate": 0.012895613608973058, + "loss": 3.703, + "mean_token_accuracy": 0.33438849449157715, + "num_tokens": 4248995129.0, + "step": 8312 + }, + { + "epoch": 2.247971876690103, + "grad_norm": 2.265625, + "learning_rate": 0.012894072455752295, + "loss": 3.2189, + "mean_token_accuracy": 0.3750641942024231, + "num_tokens": 4249519388.0, + "step": 8313 + }, + { + "epoch": 2.2482422931314225, + "grad_norm": 2.765625, + "learning_rate": 0.012892531244415818, + "loss": 3.2957, + "mean_token_accuracy": 0.379618376493454, + "num_tokens": 4250043593.0, + "step": 8314 + }, + { + "epoch": 2.248512709572742, + "grad_norm": 3.71875, + "learning_rate": 0.012890989975010918, + "loss": 3.3344, + "mean_token_accuracy": 0.38885778188705444, + "num_tokens": 4250567859.0, + "step": 8315 + }, + { + "epoch": 2.248783126014062, + "grad_norm": 2.6875, + "learning_rate": 0.012889448647584885, + "loss": 3.3141, + "mean_token_accuracy": 0.38254451751708984, + "num_tokens": 4251047432.0, + "step": 8316 + }, + { + "epoch": 2.2490535424553815, + "grad_norm": 2.75, + "learning_rate": 0.012887907262185009, + "loss": 3.1268, + "mean_token_accuracy": 0.39632555842399597, + "num_tokens": 4251571582.0, + "step": 8317 + }, + { + "epoch": 2.249323958896701, + "grad_norm": 2.828125, + "learning_rate": 0.012886365818858587, + "loss": 3.1348, + "mean_token_accuracy": 0.3917925953865051, + "num_tokens": 4252095847.0, + "step": 8318 + }, + { + "epoch": 2.2495943753380208, + "grad_norm": 2.765625, + "learning_rate": 0.012884824317652914, + "loss": 3.1226, + "mean_token_accuracy": 0.3875664472579956, + "num_tokens": 4252619977.0, + "step": 8319 + }, + { + "epoch": 2.2498647917793404, + "grad_norm": 3.046875, + "learning_rate": 0.01288328275861529, + "loss": 3.3372, + "mean_token_accuracy": 0.3906736671924591, + "num_tokens": 4253144179.0, + "step": 8320 + }, + { + "epoch": 2.2501352082206596, + "grad_norm": 2.59375, + "learning_rate": 0.012881741141793008, + "loss": 3.1927, + "mean_token_accuracy": 0.4036809206008911, + "num_tokens": 4253668445.0, + "step": 8321 + }, + { + "epoch": 2.2504056246619797, + "grad_norm": 2.765625, + "learning_rate": 0.012880199467233377, + "loss": 2.9321, + "mean_token_accuracy": 0.417949378490448, + "num_tokens": 4254187206.0, + "step": 8322 + }, + { + "epoch": 2.250676041103299, + "grad_norm": 3.15625, + "learning_rate": 0.012878657734983699, + "loss": 3.2983, + "mean_token_accuracy": 0.36578869819641113, + "num_tokens": 4254711469.0, + "step": 8323 + }, + { + "epoch": 2.2509464575446185, + "grad_norm": 4.21875, + "learning_rate": 0.012877115945091276, + "loss": 3.1337, + "mean_token_accuracy": 0.3813011348247528, + "num_tokens": 4255235753.0, + "step": 8324 + }, + { + "epoch": 2.251216873985938, + "grad_norm": 2.1875, + "learning_rate": 0.012875574097603412, + "loss": 3.2247, + "mean_token_accuracy": 0.3762893080711365, + "num_tokens": 4255760017.0, + "step": 8325 + }, + { + "epoch": 2.251487290427258, + "grad_norm": 3.375, + "learning_rate": 0.012874032192567423, + "loss": 3.1756, + "mean_token_accuracy": 0.40214449167251587, + "num_tokens": 4256284253.0, + "step": 8326 + }, + { + "epoch": 2.2517577068685775, + "grad_norm": 2.71875, + "learning_rate": 0.012872490230030613, + "loss": 3.1338, + "mean_token_accuracy": 0.3982281982898712, + "num_tokens": 4256801477.0, + "step": 8327 + }, + { + "epoch": 2.252028123309897, + "grad_norm": 2.734375, + "learning_rate": 0.012870948210040294, + "loss": 3.0828, + "mean_token_accuracy": 0.39932015538215637, + "num_tokens": 4257325690.0, + "step": 8328 + }, + { + "epoch": 2.2522985397512167, + "grad_norm": 2.828125, + "learning_rate": 0.012869406132643788, + "loss": 3.1676, + "mean_token_accuracy": 0.3970738351345062, + "num_tokens": 4257849918.0, + "step": 8329 + }, + { + "epoch": 2.2525689561925364, + "grad_norm": 3.046875, + "learning_rate": 0.012867863997888399, + "loss": 3.1189, + "mean_token_accuracy": 0.41366046667099, + "num_tokens": 4258351707.0, + "step": 8330 + }, + { + "epoch": 2.252839372633856, + "grad_norm": 92.5, + "learning_rate": 0.012866321805821451, + "loss": 19.6549, + "mean_token_accuracy": 0.03616795688867569, + "num_tokens": 4258849180.0, + "step": 8331 + }, + { + "epoch": 2.2531097890751757, + "grad_norm": 5.4375, + "learning_rate": 0.012864779556490263, + "loss": 3.5312, + "mean_token_accuracy": 0.3555832505226135, + "num_tokens": 4259350497.0, + "step": 8332 + }, + { + "epoch": 2.2533802055164953, + "grad_norm": 2.625, + "learning_rate": 0.012863237249942152, + "loss": 3.3612, + "mean_token_accuracy": 0.3824923634529114, + "num_tokens": 4259874765.0, + "step": 8333 + }, + { + "epoch": 2.253650621957815, + "grad_norm": 2.609375, + "learning_rate": 0.012861694886224444, + "loss": 3.0218, + "mean_token_accuracy": 0.40386301279067993, + "num_tokens": 4260398993.0, + "step": 8334 + }, + { + "epoch": 2.2539210383991346, + "grad_norm": 2.421875, + "learning_rate": 0.012860152465384463, + "loss": 3.3846, + "mean_token_accuracy": 0.37639713287353516, + "num_tokens": 4260923171.0, + "step": 8335 + }, + { + "epoch": 2.2541914548404542, + "grad_norm": 2.46875, + "learning_rate": 0.012858609987469532, + "loss": 3.1996, + "mean_token_accuracy": 0.42053714394569397, + "num_tokens": 4261384461.0, + "step": 8336 + }, + { + "epoch": 2.254461871281774, + "grad_norm": 2.796875, + "learning_rate": 0.012857067452526981, + "loss": 3.2553, + "mean_token_accuracy": 0.40590041875839233, + "num_tokens": 4261872285.0, + "step": 8337 + }, + { + "epoch": 2.2547322877230935, + "grad_norm": 3.046875, + "learning_rate": 0.012855524860604142, + "loss": 3.0614, + "mean_token_accuracy": 0.40488383173942566, + "num_tokens": 4262396481.0, + "step": 8338 + }, + { + "epoch": 2.255002704164413, + "grad_norm": 2.71875, + "learning_rate": 0.012853982211748341, + "loss": 3.1474, + "mean_token_accuracy": 0.41878536343574524, + "num_tokens": 4262860808.0, + "step": 8339 + }, + { + "epoch": 2.255273120605733, + "grad_norm": 3.109375, + "learning_rate": 0.012852439506006912, + "loss": 3.2369, + "mean_token_accuracy": 0.3894907236099243, + "num_tokens": 4263384893.0, + "step": 8340 + }, + { + "epoch": 2.2555435370470525, + "grad_norm": 2.859375, + "learning_rate": 0.012850896743427194, + "loss": 3.1496, + "mean_token_accuracy": 0.3978513479232788, + "num_tokens": 4263909157.0, + "step": 8341 + }, + { + "epoch": 2.255813953488372, + "grad_norm": 3.375, + "learning_rate": 0.01284935392405652, + "loss": 2.9608, + "mean_token_accuracy": 0.41149401664733887, + "num_tokens": 4264433440.0, + "step": 8342 + }, + { + "epoch": 2.2560843699296917, + "grad_norm": 4.21875, + "learning_rate": 0.012847811047942229, + "loss": 3.1026, + "mean_token_accuracy": 0.4253040552139282, + "num_tokens": 4264957634.0, + "step": 8343 + }, + { + "epoch": 2.2563547863710114, + "grad_norm": 3.359375, + "learning_rate": 0.01284626811513166, + "loss": 3.2211, + "mean_token_accuracy": 0.4053798317909241, + "num_tokens": 4265481904.0, + "step": 8344 + }, + { + "epoch": 2.256625202812331, + "grad_norm": 3.015625, + "learning_rate": 0.012844725125672158, + "loss": 3.1238, + "mean_token_accuracy": 0.3868303894996643, + "num_tokens": 4266006116.0, + "step": 8345 + }, + { + "epoch": 2.2568956192536507, + "grad_norm": 2.6875, + "learning_rate": 0.012843182079611062, + "loss": 3.2456, + "mean_token_accuracy": 0.40979641675949097, + "num_tokens": 4266507451.0, + "step": 8346 + }, + { + "epoch": 2.2571660356949703, + "grad_norm": 3.671875, + "learning_rate": 0.012841638976995723, + "loss": 3.1553, + "mean_token_accuracy": 0.39225614070892334, + "num_tokens": 4267031614.0, + "step": 8347 + }, + { + "epoch": 2.25743645213629, + "grad_norm": 2.84375, + "learning_rate": 0.01284009581787348, + "loss": 3.0624, + "mean_token_accuracy": 0.41358619928359985, + "num_tokens": 4267533093.0, + "step": 8348 + }, + { + "epoch": 2.2577068685776096, + "grad_norm": 4.0, + "learning_rate": 0.012838552602291687, + "loss": 3.0887, + "mean_token_accuracy": 0.38188135623931885, + "num_tokens": 4268057154.0, + "step": 8349 + }, + { + "epoch": 2.2579772850189292, + "grad_norm": 2.515625, + "learning_rate": 0.012837009330297694, + "loss": 3.1162, + "mean_token_accuracy": 0.4027710556983948, + "num_tokens": 4268581351.0, + "step": 8350 + }, + { + "epoch": 2.258247701460249, + "grad_norm": 75.5, + "learning_rate": 0.012835466001938849, + "loss": 15.5391, + "mean_token_accuracy": 0.00024015945382416248, + "num_tokens": 4269025412.0, + "step": 8351 + }, + { + "epoch": 2.2585181179015685, + "grad_norm": 5.8125, + "learning_rate": 0.012833922617262515, + "loss": 3.6505, + "mean_token_accuracy": 0.34495726227760315, + "num_tokens": 4269549606.0, + "step": 8352 + }, + { + "epoch": 2.258788534342888, + "grad_norm": 1.921875, + "learning_rate": 0.012832379176316038, + "loss": 3.2524, + "mean_token_accuracy": 0.3964289724826813, + "num_tokens": 4270048673.0, + "step": 8353 + }, + { + "epoch": 2.259058950784208, + "grad_norm": 2.28125, + "learning_rate": 0.012830835679146781, + "loss": 2.9894, + "mean_token_accuracy": 0.4117175340652466, + "num_tokens": 4270553770.0, + "step": 8354 + }, + { + "epoch": 2.2593293672255275, + "grad_norm": 2.96875, + "learning_rate": 0.012829292125802104, + "loss": 3.0417, + "mean_token_accuracy": 0.42815154790878296, + "num_tokens": 4271023491.0, + "step": 8355 + }, + { + "epoch": 2.259599783666847, + "grad_norm": 2.203125, + "learning_rate": 0.012827748516329358, + "loss": 3.2313, + "mean_token_accuracy": 0.41168421506881714, + "num_tokens": 4271547748.0, + "step": 8356 + }, + { + "epoch": 2.2598702001081667, + "grad_norm": 2.78125, + "learning_rate": 0.012826204850775916, + "loss": 3.1302, + "mean_token_accuracy": 0.40292853116989136, + "num_tokens": 4272071942.0, + "step": 8357 + }, + { + "epoch": 2.2601406165494864, + "grad_norm": 2.21875, + "learning_rate": 0.012824661129189141, + "loss": 3.1828, + "mean_token_accuracy": 0.41455572843551636, + "num_tokens": 4272596226.0, + "step": 8358 + }, + { + "epoch": 2.260411032990806, + "grad_norm": 3.515625, + "learning_rate": 0.012823117351616396, + "loss": 3.2149, + "mean_token_accuracy": 0.4100807309150696, + "num_tokens": 4273055085.0, + "step": 8359 + }, + { + "epoch": 2.2606814494321252, + "grad_norm": 2.515625, + "learning_rate": 0.012821573518105048, + "loss": 3.106, + "mean_token_accuracy": 0.3947094678878784, + "num_tokens": 4273579369.0, + "step": 8360 + }, + { + "epoch": 2.2609518658734453, + "grad_norm": 4.71875, + "learning_rate": 0.012820029628702465, + "loss": 3.1299, + "mean_token_accuracy": 0.39358842372894287, + "num_tokens": 4274103598.0, + "step": 8361 + }, + { + "epoch": 2.2612222823147645, + "grad_norm": 3.296875, + "learning_rate": 0.012818485683456025, + "loss": 2.9761, + "mean_token_accuracy": 0.4080020785331726, + "num_tokens": 4274627750.0, + "step": 8362 + }, + { + "epoch": 2.2614926987560846, + "grad_norm": 4.28125, + "learning_rate": 0.012816941682413093, + "loss": 3.0873, + "mean_token_accuracy": 0.38575977087020874, + "num_tokens": 4275151931.0, + "step": 8363 + }, + { + "epoch": 2.261763115197404, + "grad_norm": 2.75, + "learning_rate": 0.012815397625621047, + "loss": 3.1857, + "mean_token_accuracy": 0.3748438358306885, + "num_tokens": 4275676173.0, + "step": 8364 + }, + { + "epoch": 2.2620335316387234, + "grad_norm": 3.09375, + "learning_rate": 0.012813853513127265, + "loss": 3.1736, + "mean_token_accuracy": 0.388860285282135, + "num_tokens": 4276200322.0, + "step": 8365 + }, + { + "epoch": 2.262303948080043, + "grad_norm": 2.734375, + "learning_rate": 0.012812309344979123, + "loss": 3.1474, + "mean_token_accuracy": 0.38627365231513977, + "num_tokens": 4276724590.0, + "step": 8366 + }, + { + "epoch": 2.2625743645213627, + "grad_norm": 2.6875, + "learning_rate": 0.012810765121224, + "loss": 3.1303, + "mean_token_accuracy": 0.3851609230041504, + "num_tokens": 4277248866.0, + "step": 8367 + }, + { + "epoch": 2.2628447809626824, + "grad_norm": 2.8125, + "learning_rate": 0.01280922084190928, + "loss": 2.9006, + "mean_token_accuracy": 0.3999481201171875, + "num_tokens": 4277709479.0, + "step": 8368 + }, + { + "epoch": 2.263115197404002, + "grad_norm": 2.234375, + "learning_rate": 0.012807676507082342, + "loss": 3.146, + "mean_token_accuracy": 0.38482365012168884, + "num_tokens": 4278233661.0, + "step": 8369 + }, + { + "epoch": 2.2633856138453217, + "grad_norm": 2.578125, + "learning_rate": 0.01280613211679057, + "loss": 3.2187, + "mean_token_accuracy": 0.39363911747932434, + "num_tokens": 4278757793.0, + "step": 8370 + }, + { + "epoch": 2.2636560302866413, + "grad_norm": 7.78125, + "learning_rate": 0.012804587671081356, + "loss": 11.539, + "mean_token_accuracy": 6.6539532781462185e-06, + "num_tokens": 4279281945.0, + "step": 8371 + }, + { + "epoch": 2.263926446727961, + "grad_norm": 6.75, + "learning_rate": 0.012803043170002087, + "loss": 3.5563, + "mean_token_accuracy": 0.3789730370044708, + "num_tokens": 4279806208.0, + "step": 8372 + }, + { + "epoch": 2.2641968631692806, + "grad_norm": 3.109375, + "learning_rate": 0.01280149861360015, + "loss": 3.1896, + "mean_token_accuracy": 0.35550302267074585, + "num_tokens": 4280330397.0, + "step": 8373 + }, + { + "epoch": 2.2644672796106002, + "grad_norm": 2.578125, + "learning_rate": 0.012799954001922936, + "loss": 3.1445, + "mean_token_accuracy": 0.3932272791862488, + "num_tokens": 4280854632.0, + "step": 8374 + }, + { + "epoch": 2.26473769605192, + "grad_norm": 2.90625, + "learning_rate": 0.01279840933501784, + "loss": 3.1659, + "mean_token_accuracy": 0.38152971863746643, + "num_tokens": 4281378829.0, + "step": 8375 + }, + { + "epoch": 2.2650081124932395, + "grad_norm": 2.09375, + "learning_rate": 0.012796864612932257, + "loss": 3.0797, + "mean_token_accuracy": 0.41681480407714844, + "num_tokens": 4281903094.0, + "step": 8376 + }, + { + "epoch": 2.265278528934559, + "grad_norm": 3.640625, + "learning_rate": 0.012795319835713583, + "loss": 3.3028, + "mean_token_accuracy": 0.37491628527641296, + "num_tokens": 4282427374.0, + "step": 8377 + }, + { + "epoch": 2.265548945375879, + "grad_norm": 2.921875, + "learning_rate": 0.012793775003409221, + "loss": 3.2687, + "mean_token_accuracy": 0.38492435216903687, + "num_tokens": 4282951551.0, + "step": 8378 + }, + { + "epoch": 2.2658193618171985, + "grad_norm": 3.234375, + "learning_rate": 0.01279223011606656, + "loss": 3.1543, + "mean_token_accuracy": 0.4016662836074829, + "num_tokens": 4283436864.0, + "step": 8379 + }, + { + "epoch": 2.266089778258518, + "grad_norm": 2.859375, + "learning_rate": 0.012790685173733014, + "loss": 3.1513, + "mean_token_accuracy": 0.392391562461853, + "num_tokens": 4283961135.0, + "step": 8380 + }, + { + "epoch": 2.2663601946998377, + "grad_norm": 2.375, + "learning_rate": 0.01278914017645598, + "loss": 3.0491, + "mean_token_accuracy": 0.41576510667800903, + "num_tokens": 4284485351.0, + "step": 8381 + }, + { + "epoch": 2.2666306111411574, + "grad_norm": 2.71875, + "learning_rate": 0.012787595124282862, + "loss": 2.94, + "mean_token_accuracy": 0.404446542263031, + "num_tokens": 4285009610.0, + "step": 8382 + }, + { + "epoch": 2.266901027582477, + "grad_norm": 2.5, + "learning_rate": 0.012786050017261071, + "loss": 2.8874, + "mean_token_accuracy": 0.4151482582092285, + "num_tokens": 4285533838.0, + "step": 8383 + }, + { + "epoch": 2.2671714440237967, + "grad_norm": 2.921875, + "learning_rate": 0.01278450485543801, + "loss": 3.1105, + "mean_token_accuracy": 0.40803927183151245, + "num_tokens": 4286058038.0, + "step": 8384 + }, + { + "epoch": 2.2674418604651163, + "grad_norm": 3.765625, + "learning_rate": 0.012782959638861094, + "loss": 3.0794, + "mean_token_accuracy": 0.3943943381309509, + "num_tokens": 4286544285.0, + "step": 8385 + }, + { + "epoch": 2.267712276906436, + "grad_norm": 3.09375, + "learning_rate": 0.01278141436757773, + "loss": 3.1053, + "mean_token_accuracy": 0.4026063084602356, + "num_tokens": 4287063868.0, + "step": 8386 + }, + { + "epoch": 2.2679826933477556, + "grad_norm": 2.828125, + "learning_rate": 0.012779869041635339, + "loss": 3.0814, + "mean_token_accuracy": 0.3988656997680664, + "num_tokens": 4287537695.0, + "step": 8387 + }, + { + "epoch": 2.2682531097890752, + "grad_norm": 2.703125, + "learning_rate": 0.01277832366108133, + "loss": 3.0585, + "mean_token_accuracy": 0.41331106424331665, + "num_tokens": 4288061920.0, + "step": 8388 + }, + { + "epoch": 2.268523526230395, + "grad_norm": 2.78125, + "learning_rate": 0.012776778225963119, + "loss": 3.0327, + "mean_token_accuracy": 0.4228827953338623, + "num_tokens": 4288472098.0, + "step": 8389 + }, + { + "epoch": 2.2687939426717145, + "grad_norm": 2.78125, + "learning_rate": 0.012775232736328128, + "loss": 3.1144, + "mean_token_accuracy": 0.4157894253730774, + "num_tokens": 4288996263.0, + "step": 8390 + }, + { + "epoch": 2.269064359113034, + "grad_norm": 33.5, + "learning_rate": 0.012773687192223777, + "loss": 11.3356, + "mean_token_accuracy": 0.013594241812825203, + "num_tokens": 4289520389.0, + "step": 8391 + }, + { + "epoch": 2.269334775554354, + "grad_norm": 5.5625, + "learning_rate": 0.012772141593697486, + "loss": 3.4463, + "mean_token_accuracy": 0.3412610590457916, + "num_tokens": 4290044656.0, + "step": 8392 + }, + { + "epoch": 2.2696051919956735, + "grad_norm": 1.9609375, + "learning_rate": 0.012770595940796681, + "loss": 3.1824, + "mean_token_accuracy": 0.3925192356109619, + "num_tokens": 4290568847.0, + "step": 8393 + }, + { + "epoch": 2.269875608436993, + "grad_norm": 3.015625, + "learning_rate": 0.012769050233568786, + "loss": 3.2051, + "mean_token_accuracy": 0.39660847187042236, + "num_tokens": 4291093025.0, + "step": 8394 + }, + { + "epoch": 2.2701460248783127, + "grad_norm": 3.4375, + "learning_rate": 0.012767504472061226, + "loss": 3.3547, + "mean_token_accuracy": 0.3866364359855652, + "num_tokens": 4291617234.0, + "step": 8395 + }, + { + "epoch": 2.2704164413196324, + "grad_norm": 3.0625, + "learning_rate": 0.01276595865632143, + "loss": 3.1871, + "mean_token_accuracy": 0.393007755279541, + "num_tokens": 4292141446.0, + "step": 8396 + }, + { + "epoch": 2.270686857760952, + "grad_norm": 3.25, + "learning_rate": 0.01276441278639683, + "loss": 3.272, + "mean_token_accuracy": 0.39656031131744385, + "num_tokens": 4292641313.0, + "step": 8397 + }, + { + "epoch": 2.2709572742022717, + "grad_norm": 3.03125, + "learning_rate": 0.012762866862334856, + "loss": 3.1622, + "mean_token_accuracy": 0.3924434185028076, + "num_tokens": 4293130902.0, + "step": 8398 + }, + { + "epoch": 2.2712276906435913, + "grad_norm": 3.359375, + "learning_rate": 0.012761320884182943, + "loss": 3.4838, + "mean_token_accuracy": 0.3626898527145386, + "num_tokens": 4293638485.0, + "step": 8399 + }, + { + "epoch": 2.271498107084911, + "grad_norm": 3.515625, + "learning_rate": 0.012759774851988526, + "loss": 3.1704, + "mean_token_accuracy": 0.40334779024124146, + "num_tokens": 4294105601.0, + "step": 8400 + }, + { + "epoch": 2.27176852352623, + "grad_norm": 2.59375, + "learning_rate": 0.012758228765799042, + "loss": 2.9257, + "mean_token_accuracy": 0.3850514888763428, + "num_tokens": 4294629780.0, + "step": 8401 + }, + { + "epoch": 2.2720389399675502, + "grad_norm": 2.5625, + "learning_rate": 0.01275668262566193, + "loss": 3.0566, + "mean_token_accuracy": 0.40511125326156616, + "num_tokens": 4295153964.0, + "step": 8402 + }, + { + "epoch": 2.2723093564088694, + "grad_norm": 8.5625, + "learning_rate": 0.012755136431624625, + "loss": 3.0566, + "mean_token_accuracy": 0.4177882969379425, + "num_tokens": 4295678137.0, + "step": 8403 + }, + { + "epoch": 2.2725797728501895, + "grad_norm": 2.0625, + "learning_rate": 0.012753590183734578, + "loss": 3.0411, + "mean_token_accuracy": 0.40377652645111084, + "num_tokens": 4296202407.0, + "step": 8404 + }, + { + "epoch": 2.2728501892915087, + "grad_norm": 2.453125, + "learning_rate": 0.012752043882039222, + "loss": 3.0364, + "mean_token_accuracy": 0.4209303557872772, + "num_tokens": 4296579479.0, + "step": 8405 + }, + { + "epoch": 2.2731206057328284, + "grad_norm": 2.890625, + "learning_rate": 0.012750497526586007, + "loss": 3.1485, + "mean_token_accuracy": 0.4327769875526428, + "num_tokens": 4296957836.0, + "step": 8406 + }, + { + "epoch": 2.273391022174148, + "grad_norm": 3.296875, + "learning_rate": 0.012748951117422382, + "loss": 3.1224, + "mean_token_accuracy": 0.3968685269355774, + "num_tokens": 4297462102.0, + "step": 8407 + }, + { + "epoch": 2.2736614386154677, + "grad_norm": 2.78125, + "learning_rate": 0.012747404654595796, + "loss": 3.106, + "mean_token_accuracy": 0.3966156244277954, + "num_tokens": 4297986192.0, + "step": 8408 + }, + { + "epoch": 2.2739318550567873, + "grad_norm": 3.140625, + "learning_rate": 0.012745858138153694, + "loss": 3.0567, + "mean_token_accuracy": 0.3841283321380615, + "num_tokens": 4298510447.0, + "step": 8409 + }, + { + "epoch": 2.274202271498107, + "grad_norm": 2.8125, + "learning_rate": 0.01274431156814353, + "loss": 3.2018, + "mean_token_accuracy": 0.4011308550834656, + "num_tokens": 4299034526.0, + "step": 8410 + }, + { + "epoch": 2.2744726879394266, + "grad_norm": 5.8125, + "learning_rate": 0.012742764944612756, + "loss": 10.8244, + "mean_token_accuracy": 8.890348908607848e-06, + "num_tokens": 4299558759.0, + "step": 8411 + }, + { + "epoch": 2.2747431043807462, + "grad_norm": 7.9375, + "learning_rate": 0.012741218267608831, + "loss": 3.7021, + "mean_token_accuracy": 0.33121931552886963, + "num_tokens": 4300082969.0, + "step": 8412 + }, + { + "epoch": 2.275013520822066, + "grad_norm": 2.390625, + "learning_rate": 0.012739671537179207, + "loss": 3.35, + "mean_token_accuracy": 0.36348557472229004, + "num_tokens": 4300607183.0, + "step": 8413 + }, + { + "epoch": 2.2752839372633855, + "grad_norm": 2.390625, + "learning_rate": 0.012738124753371348, + "loss": 3.3229, + "mean_token_accuracy": 0.380386620759964, + "num_tokens": 4301131380.0, + "step": 8414 + }, + { + "epoch": 2.275554353704705, + "grad_norm": 3.109375, + "learning_rate": 0.012736577916232705, + "loss": 3.1789, + "mean_token_accuracy": 0.38301268219947815, + "num_tokens": 4301655659.0, + "step": 8415 + }, + { + "epoch": 2.275824770146025, + "grad_norm": 2.53125, + "learning_rate": 0.012735031025810747, + "loss": 3.2287, + "mean_token_accuracy": 0.4002230167388916, + "num_tokens": 4302179933.0, + "step": 8416 + }, + { + "epoch": 2.2760951865873444, + "grad_norm": 2.5, + "learning_rate": 0.012733484082152935, + "loss": 3.0917, + "mean_token_accuracy": 0.3982751667499542, + "num_tokens": 4302691815.0, + "step": 8417 + }, + { + "epoch": 2.276365603028664, + "grad_norm": 2.5, + "learning_rate": 0.012731937085306733, + "loss": 3.141, + "mean_token_accuracy": 0.40798187255859375, + "num_tokens": 4303213350.0, + "step": 8418 + }, + { + "epoch": 2.2766360194699837, + "grad_norm": 2.6875, + "learning_rate": 0.012730390035319606, + "loss": 3.1024, + "mean_token_accuracy": 0.4096795320510864, + "num_tokens": 4303737545.0, + "step": 8419 + }, + { + "epoch": 2.2769064359113034, + "grad_norm": 2.65625, + "learning_rate": 0.012728842932239026, + "loss": 3.0193, + "mean_token_accuracy": 0.43252459168434143, + "num_tokens": 4304261746.0, + "step": 8420 + }, + { + "epoch": 2.277176852352623, + "grad_norm": 2.9375, + "learning_rate": 0.012727295776112462, + "loss": 3.1347, + "mean_token_accuracy": 0.3959950804710388, + "num_tokens": 4304775928.0, + "step": 8421 + }, + { + "epoch": 2.2774472687939427, + "grad_norm": 16.625, + "learning_rate": 0.012725748566987381, + "loss": 2.9246, + "mean_token_accuracy": 0.47874704003334045, + "num_tokens": 4305209797.0, + "step": 8422 + }, + { + "epoch": 2.2777176852352623, + "grad_norm": 3.734375, + "learning_rate": 0.01272420130491126, + "loss": 3.1721, + "mean_token_accuracy": 0.37873029708862305, + "num_tokens": 4305733881.0, + "step": 8423 + }, + { + "epoch": 2.277988101676582, + "grad_norm": 2.390625, + "learning_rate": 0.012722653989931575, + "loss": 2.8807, + "mean_token_accuracy": 0.41561001539230347, + "num_tokens": 4306258025.0, + "step": 8424 + }, + { + "epoch": 2.2782585181179016, + "grad_norm": 3.203125, + "learning_rate": 0.012721106622095798, + "loss": 3.23, + "mean_token_accuracy": 0.3824215531349182, + "num_tokens": 4306782308.0, + "step": 8425 + }, + { + "epoch": 2.2785289345592212, + "grad_norm": 7.0, + "learning_rate": 0.012719559201451404, + "loss": 2.9099, + "mean_token_accuracy": 0.4341108500957489, + "num_tokens": 4307306477.0, + "step": 8426 + }, + { + "epoch": 2.278799351000541, + "grad_norm": 1.96875, + "learning_rate": 0.012718011728045882, + "loss": 3.0656, + "mean_token_accuracy": 0.3993667960166931, + "num_tokens": 4307818320.0, + "step": 8427 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 4.28125, + "learning_rate": 0.012716464201926708, + "loss": 3.3684, + "mean_token_accuracy": 0.37107616662979126, + "num_tokens": 4308342502.0, + "step": 8428 + }, + { + "epoch": 2.27934018388318, + "grad_norm": 2.59375, + "learning_rate": 0.012714916623141361, + "loss": 3.1926, + "mean_token_accuracy": 0.3738923668861389, + "num_tokens": 4308866559.0, + "step": 8429 + }, + { + "epoch": 2.2796106003245, + "grad_norm": 2.921875, + "learning_rate": 0.012713368991737334, + "loss": 3.1049, + "mean_token_accuracy": 0.4038980305194855, + "num_tokens": 4309390576.0, + "step": 8430 + }, + { + "epoch": 2.2798810167658194, + "grad_norm": 4.0, + "learning_rate": 0.012711821307762102, + "loss": 9.1233, + "mean_token_accuracy": 0.007118752226233482, + "num_tokens": 4309914692.0, + "step": 8431 + }, + { + "epoch": 2.280151433207139, + "grad_norm": 7.03125, + "learning_rate": 0.01271027357126316, + "loss": 3.6567, + "mean_token_accuracy": 0.3600860834121704, + "num_tokens": 4310438973.0, + "step": 8432 + }, + { + "epoch": 2.2804218496484587, + "grad_norm": 2.0, + "learning_rate": 0.012708725782287995, + "loss": 3.0602, + "mean_token_accuracy": 0.39873725175857544, + "num_tokens": 4310963186.0, + "step": 8433 + }, + { + "epoch": 2.2806922660897784, + "grad_norm": 2.625, + "learning_rate": 0.0127071779408841, + "loss": 3.2491, + "mean_token_accuracy": 0.39820632338523865, + "num_tokens": 4311487462.0, + "step": 8434 + }, + { + "epoch": 2.280962682531098, + "grad_norm": 4.4375, + "learning_rate": 0.012705630047098961, + "loss": 3.3583, + "mean_token_accuracy": 0.3552238345146179, + "num_tokens": 4312011620.0, + "step": 8435 + }, + { + "epoch": 2.2812330989724177, + "grad_norm": 2.546875, + "learning_rate": 0.012704082100980079, + "loss": 3.3618, + "mean_token_accuracy": 0.38774722814559937, + "num_tokens": 4312535838.0, + "step": 8436 + }, + { + "epoch": 2.2815035154137373, + "grad_norm": 3.890625, + "learning_rate": 0.01270253410257495, + "loss": 3.2639, + "mean_token_accuracy": 0.380265474319458, + "num_tokens": 4313060035.0, + "step": 8437 + }, + { + "epoch": 2.281773931855057, + "grad_norm": 3.40625, + "learning_rate": 0.012700986051931065, + "loss": 3.4027, + "mean_token_accuracy": 0.36157146096229553, + "num_tokens": 4313584299.0, + "step": 8438 + }, + { + "epoch": 2.2820443482963766, + "grad_norm": 3.65625, + "learning_rate": 0.012699437949095924, + "loss": 3.0699, + "mean_token_accuracy": 0.39547351002693176, + "num_tokens": 4314108579.0, + "step": 8439 + }, + { + "epoch": 2.2823147647376962, + "grad_norm": 2.921875, + "learning_rate": 0.012697889794117033, + "loss": 3.2084, + "mean_token_accuracy": 0.4000847339630127, + "num_tokens": 4314632834.0, + "step": 8440 + }, + { + "epoch": 2.282585181179016, + "grad_norm": 4.03125, + "learning_rate": 0.012696341587041884, + "loss": 3.0555, + "mean_token_accuracy": 0.3800390064716339, + "num_tokens": 4315157035.0, + "step": 8441 + }, + { + "epoch": 2.282855597620335, + "grad_norm": 2.890625, + "learning_rate": 0.012694793327917989, + "loss": 3.1189, + "mean_token_accuracy": 0.39374446868896484, + "num_tokens": 4315681309.0, + "step": 8442 + }, + { + "epoch": 2.283126014061655, + "grad_norm": 3.71875, + "learning_rate": 0.012693245016792856, + "loss": 2.9661, + "mean_token_accuracy": 0.4607192873954773, + "num_tokens": 4316205399.0, + "step": 8443 + }, + { + "epoch": 2.2833964305029744, + "grad_norm": 1.9609375, + "learning_rate": 0.01269169665371398, + "loss": 2.9971, + "mean_token_accuracy": 0.41073381900787354, + "num_tokens": 4316729514.0, + "step": 8444 + }, + { + "epoch": 2.2836668469442944, + "grad_norm": 2.9375, + "learning_rate": 0.012690148238728875, + "loss": 3.3248, + "mean_token_accuracy": 0.3864424228668213, + "num_tokens": 4317253690.0, + "step": 8445 + }, + { + "epoch": 2.2839372633856136, + "grad_norm": 2.9375, + "learning_rate": 0.012688599771885056, + "loss": 3.2912, + "mean_token_accuracy": 0.3642207980155945, + "num_tokens": 4317777966.0, + "step": 8446 + }, + { + "epoch": 2.2842076798269333, + "grad_norm": 2.78125, + "learning_rate": 0.012687051253230028, + "loss": 2.9973, + "mean_token_accuracy": 0.3651232421398163, + "num_tokens": 4318302148.0, + "step": 8447 + }, + { + "epoch": 2.284478096268253, + "grad_norm": 2.671875, + "learning_rate": 0.012685502682811305, + "loss": 3.1186, + "mean_token_accuracy": 0.40388548374176025, + "num_tokens": 4318820245.0, + "step": 8448 + }, + { + "epoch": 2.2847485127095726, + "grad_norm": 3.125, + "learning_rate": 0.012683954060676406, + "loss": 3.1362, + "mean_token_accuracy": 0.38668292760849, + "num_tokens": 4319344468.0, + "step": 8449 + }, + { + "epoch": 2.285018929150892, + "grad_norm": 2.84375, + "learning_rate": 0.012682405386872844, + "loss": 3.1447, + "mean_token_accuracy": 0.4034518003463745, + "num_tokens": 4319868745.0, + "step": 8450 + }, + { + "epoch": 2.285289345592212, + "grad_norm": 55.25, + "learning_rate": 0.012680856661448135, + "loss": 16.93, + "mean_token_accuracy": 0.012074227444827557, + "num_tokens": 4320393009.0, + "step": 8451 + }, + { + "epoch": 2.2855597620335315, + "grad_norm": 7.625, + "learning_rate": 0.012679307884449802, + "loss": 3.3292, + "mean_token_accuracy": 0.328549861907959, + "num_tokens": 4320917110.0, + "step": 8452 + }, + { + "epoch": 2.285830178474851, + "grad_norm": 2.21875, + "learning_rate": 0.012677759055925367, + "loss": 2.9477, + "mean_token_accuracy": 0.38126838207244873, + "num_tokens": 4321441347.0, + "step": 8453 + }, + { + "epoch": 2.286100594916171, + "grad_norm": 2.125, + "learning_rate": 0.012676210175922345, + "loss": 3.1349, + "mean_token_accuracy": 0.4082266688346863, + "num_tokens": 4321965612.0, + "step": 8454 + }, + { + "epoch": 2.2863710113574904, + "grad_norm": 2.765625, + "learning_rate": 0.012674661244488268, + "loss": 3.2671, + "mean_token_accuracy": 0.39640647172927856, + "num_tokens": 4322484639.0, + "step": 8455 + }, + { + "epoch": 2.28664142779881, + "grad_norm": 2.765625, + "learning_rate": 0.012673112261670656, + "loss": 3.2551, + "mean_token_accuracy": 0.38598868250846863, + "num_tokens": 4323008878.0, + "step": 8456 + }, + { + "epoch": 2.2869118442401297, + "grad_norm": 3.421875, + "learning_rate": 0.012671563227517042, + "loss": 3.3797, + "mean_token_accuracy": 0.3884391188621521, + "num_tokens": 4323533161.0, + "step": 8457 + }, + { + "epoch": 2.2871822606814494, + "grad_norm": 2.453125, + "learning_rate": 0.012670014142074955, + "loss": 2.8277, + "mean_token_accuracy": 0.3678966462612152, + "num_tokens": 4324057292.0, + "step": 8458 + }, + { + "epoch": 2.287452677122769, + "grad_norm": 2.75, + "learning_rate": 0.01266846500539192, + "loss": 3.2368, + "mean_token_accuracy": 0.38951775431632996, + "num_tokens": 4324581466.0, + "step": 8459 + }, + { + "epoch": 2.2877230935640886, + "grad_norm": 4.40625, + "learning_rate": 0.012666915817515475, + "loss": 3.1584, + "mean_token_accuracy": 0.3866860568523407, + "num_tokens": 4325105714.0, + "step": 8460 + }, + { + "epoch": 2.2879935100054083, + "grad_norm": 2.921875, + "learning_rate": 0.012665366578493147, + "loss": 3.4086, + "mean_token_accuracy": 0.3921666741371155, + "num_tokens": 4325567144.0, + "step": 8461 + }, + { + "epoch": 2.288263926446728, + "grad_norm": 3.03125, + "learning_rate": 0.012663817288372476, + "loss": 3.1202, + "mean_token_accuracy": 0.41323989629745483, + "num_tokens": 4326091238.0, + "step": 8462 + }, + { + "epoch": 2.2885343428880476, + "grad_norm": 3.296875, + "learning_rate": 0.012662267947200998, + "loss": 3.0426, + "mean_token_accuracy": 0.4038006663322449, + "num_tokens": 4326615317.0, + "step": 8463 + }, + { + "epoch": 2.2888047593293672, + "grad_norm": 2.421875, + "learning_rate": 0.012660718555026249, + "loss": 3.0907, + "mean_token_accuracy": 0.40918779373168945, + "num_tokens": 4327139529.0, + "step": 8464 + }, + { + "epoch": 2.289075175770687, + "grad_norm": 2.828125, + "learning_rate": 0.012659169111895772, + "loss": 3.1349, + "mean_token_accuracy": 0.3971797227859497, + "num_tokens": 4327661430.0, + "step": 8465 + }, + { + "epoch": 2.2893455922120065, + "grad_norm": 2.1875, + "learning_rate": 0.01265761961785711, + "loss": 2.9681, + "mean_token_accuracy": 0.42509013414382935, + "num_tokens": 4328158964.0, + "step": 8466 + }, + { + "epoch": 2.289616008653326, + "grad_norm": 2.625, + "learning_rate": 0.012656070072957801, + "loss": 3.2217, + "mean_token_accuracy": 0.3957277238368988, + "num_tokens": 4328683240.0, + "step": 8467 + }, + { + "epoch": 2.289886425094646, + "grad_norm": 2.921875, + "learning_rate": 0.012654520477245393, + "loss": 3.368, + "mean_token_accuracy": 0.38641151785850525, + "num_tokens": 4329196597.0, + "step": 8468 + }, + { + "epoch": 2.2901568415359654, + "grad_norm": 2.859375, + "learning_rate": 0.01265297083076743, + "loss": 2.844, + "mean_token_accuracy": 0.43262848258018494, + "num_tokens": 4329668383.0, + "step": 8469 + }, + { + "epoch": 2.290427257977285, + "grad_norm": 2.671875, + "learning_rate": 0.01265142113357146, + "loss": 2.8358, + "mean_token_accuracy": 0.4684469997882843, + "num_tokens": 4330174028.0, + "step": 8470 + }, + { + "epoch": 2.2906976744186047, + "grad_norm": 20.375, + "learning_rate": 0.012649871385705036, + "loss": 9.7679, + "mean_token_accuracy": 0.0042327577248215675, + "num_tokens": 4330635530.0, + "step": 8471 + }, + { + "epoch": 2.2909680908599244, + "grad_norm": 5.96875, + "learning_rate": 0.012648321587215704, + "loss": 3.4772, + "mean_token_accuracy": 0.355383038520813, + "num_tokens": 4331159719.0, + "step": 8472 + }, + { + "epoch": 2.291238507301244, + "grad_norm": 2.203125, + "learning_rate": 0.012646771738151019, + "loss": 3.3844, + "mean_token_accuracy": 0.38809072971343994, + "num_tokens": 4331664146.0, + "step": 8473 + }, + { + "epoch": 2.2915089237425637, + "grad_norm": 3.40625, + "learning_rate": 0.012645221838558533, + "loss": 3.3275, + "mean_token_accuracy": 0.3789256811141968, + "num_tokens": 4332165044.0, + "step": 8474 + }, + { + "epoch": 2.2917793401838833, + "grad_norm": 3.078125, + "learning_rate": 0.012643671888485804, + "loss": 3.256, + "mean_token_accuracy": 0.40251386165618896, + "num_tokens": 4332689234.0, + "step": 8475 + }, + { + "epoch": 2.292049756625203, + "grad_norm": 3.078125, + "learning_rate": 0.012642121887980386, + "loss": 3.1484, + "mean_token_accuracy": 0.39321595430374146, + "num_tokens": 4333213490.0, + "step": 8476 + }, + { + "epoch": 2.2923201730665226, + "grad_norm": 2.765625, + "learning_rate": 0.01264057183708984, + "loss": 3.1668, + "mean_token_accuracy": 0.38969942927360535, + "num_tokens": 4333737517.0, + "step": 8477 + }, + { + "epoch": 2.2925905895078422, + "grad_norm": 2.578125, + "learning_rate": 0.012639021735861726, + "loss": 2.9865, + "mean_token_accuracy": 0.4366218149662018, + "num_tokens": 4334256988.0, + "step": 8478 + }, + { + "epoch": 2.292861005949162, + "grad_norm": 2.515625, + "learning_rate": 0.012637471584343607, + "loss": 2.9731, + "mean_token_accuracy": 0.39396941661834717, + "num_tokens": 4334781231.0, + "step": 8479 + }, + { + "epoch": 2.2931314223904815, + "grad_norm": 2.75, + "learning_rate": 0.012635921382583045, + "loss": 3.1267, + "mean_token_accuracy": 0.3919209837913513, + "num_tokens": 4335305498.0, + "step": 8480 + }, + { + "epoch": 2.293401838831801, + "grad_norm": 2.375, + "learning_rate": 0.0126343711306276, + "loss": 3.1729, + "mean_token_accuracy": 0.3820672333240509, + "num_tokens": 4335829739.0, + "step": 8481 + }, + { + "epoch": 2.293672255273121, + "grad_norm": 3.5, + "learning_rate": 0.012632820828524842, + "loss": 3.1183, + "mean_token_accuracy": 0.38708972930908203, + "num_tokens": 4336353864.0, + "step": 8482 + }, + { + "epoch": 2.29394267171444, + "grad_norm": 3.390625, + "learning_rate": 0.012631270476322341, + "loss": 3.2984, + "mean_token_accuracy": 0.38377809524536133, + "num_tokens": 4336878049.0, + "step": 8483 + }, + { + "epoch": 2.29421308815576, + "grad_norm": 2.421875, + "learning_rate": 0.012629720074067666, + "loss": 2.9905, + "mean_token_accuracy": 0.40961557626724243, + "num_tokens": 4337402190.0, + "step": 8484 + }, + { + "epoch": 2.2944835045970793, + "grad_norm": 3.109375, + "learning_rate": 0.012628169621808382, + "loss": 3.3013, + "mean_token_accuracy": 0.34783700108528137, + "num_tokens": 4337926400.0, + "step": 8485 + }, + { + "epoch": 2.2947539210383994, + "grad_norm": 2.609375, + "learning_rate": 0.012626619119592073, + "loss": 2.9571, + "mean_token_accuracy": 0.40409788489341736, + "num_tokens": 4338450556.0, + "step": 8486 + }, + { + "epoch": 2.2950243374797186, + "grad_norm": 3.65625, + "learning_rate": 0.0126250685674663, + "loss": 3.1819, + "mean_token_accuracy": 0.3858526349067688, + "num_tokens": 4338974715.0, + "step": 8487 + }, + { + "epoch": 2.295294753921038, + "grad_norm": 2.703125, + "learning_rate": 0.012623517965478646, + "loss": 3.1881, + "mean_token_accuracy": 0.40724536776542664, + "num_tokens": 4339498929.0, + "step": 8488 + }, + { + "epoch": 2.295565170362358, + "grad_norm": 2.734375, + "learning_rate": 0.012621967313676688, + "loss": 3.072, + "mean_token_accuracy": 0.4190627336502075, + "num_tokens": 4339900679.0, + "step": 8489 + }, + { + "epoch": 2.2958355868036775, + "grad_norm": 2.921875, + "learning_rate": 0.012620416612108, + "loss": 3.2499, + "mean_token_accuracy": 0.40630042552948, + "num_tokens": 4340386111.0, + "step": 8490 + }, + { + "epoch": 2.296106003244997, + "grad_norm": 135.0, + "learning_rate": 0.012618865860820165, + "loss": 14.4962, + "mean_token_accuracy": 0.03869876638054848, + "num_tokens": 4340910318.0, + "step": 8491 + }, + { + "epoch": 2.296376419686317, + "grad_norm": 8.75, + "learning_rate": 0.012617315059860766, + "loss": 3.7744, + "mean_token_accuracy": 0.3110869526863098, + "num_tokens": 4341434489.0, + "step": 8492 + }, + { + "epoch": 2.2966468361276364, + "grad_norm": 2.8125, + "learning_rate": 0.012615764209277387, + "loss": 3.3651, + "mean_token_accuracy": 0.3909061551094055, + "num_tokens": 4341936029.0, + "step": 8493 + }, + { + "epoch": 2.296917252568956, + "grad_norm": 3.046875, + "learning_rate": 0.012614213309117607, + "loss": 3.2415, + "mean_token_accuracy": 0.38688236474990845, + "num_tokens": 4342460292.0, + "step": 8494 + }, + { + "epoch": 2.2971876690102757, + "grad_norm": 4.1875, + "learning_rate": 0.012612662359429015, + "loss": 3.1278, + "mean_token_accuracy": 0.3983023166656494, + "num_tokens": 4342984480.0, + "step": 8495 + }, + { + "epoch": 2.2974580854515954, + "grad_norm": 3.046875, + "learning_rate": 0.012611111360259204, + "loss": 3.108, + "mean_token_accuracy": 0.38104984164237976, + "num_tokens": 4343508637.0, + "step": 8496 + }, + { + "epoch": 2.297728501892915, + "grad_norm": 2.46875, + "learning_rate": 0.012609560311655754, + "loss": 3.2311, + "mean_token_accuracy": 0.39649370312690735, + "num_tokens": 4344032584.0, + "step": 8497 + }, + { + "epoch": 2.2979989183342346, + "grad_norm": 2.890625, + "learning_rate": 0.01260800921366626, + "loss": 3.1244, + "mean_token_accuracy": 0.3994649648666382, + "num_tokens": 4344521009.0, + "step": 8498 + }, + { + "epoch": 2.2982693347755543, + "grad_norm": 2.625, + "learning_rate": 0.012606458066338318, + "loss": 3.1635, + "mean_token_accuracy": 0.40056076645851135, + "num_tokens": 4345045187.0, + "step": 8499 + }, + { + "epoch": 2.298539751216874, + "grad_norm": 3.109375, + "learning_rate": 0.012604906869719518, + "loss": 3.3312, + "mean_token_accuracy": 0.39532190561294556, + "num_tokens": 4345482030.0, + "step": 8500 + }, + { + "epoch": 2.2988101676581936, + "grad_norm": 2.734375, + "learning_rate": 0.012603355623857454, + "loss": 3.292, + "mean_token_accuracy": 0.39809513092041016, + "num_tokens": 4346006225.0, + "step": 8501 + }, + { + "epoch": 2.299080584099513, + "grad_norm": 2.953125, + "learning_rate": 0.012601804328799721, + "loss": 3.2293, + "mean_token_accuracy": 0.3937428295612335, + "num_tokens": 4346530500.0, + "step": 8502 + }, + { + "epoch": 2.299351000540833, + "grad_norm": 3.015625, + "learning_rate": 0.012600252984593925, + "loss": 3.277, + "mean_token_accuracy": 0.3686065077781677, + "num_tokens": 4347054755.0, + "step": 8503 + }, + { + "epoch": 2.2996214169821525, + "grad_norm": 2.671875, + "learning_rate": 0.012598701591287662, + "loss": 3.0892, + "mean_token_accuracy": 0.40904784202575684, + "num_tokens": 4347578928.0, + "step": 8504 + }, + { + "epoch": 2.299891833423472, + "grad_norm": 3.15625, + "learning_rate": 0.012597150148928529, + "loss": 3.413, + "mean_token_accuracy": 0.3881133794784546, + "num_tokens": 4348103166.0, + "step": 8505 + }, + { + "epoch": 2.300162249864792, + "grad_norm": 2.984375, + "learning_rate": 0.012595598657564136, + "loss": 3.2455, + "mean_token_accuracy": 0.394584059715271, + "num_tokens": 4348627327.0, + "step": 8506 + }, + { + "epoch": 2.3004326663061114, + "grad_norm": 2.484375, + "learning_rate": 0.01259404711724208, + "loss": 3.1055, + "mean_token_accuracy": 0.40487438440322876, + "num_tokens": 4349151609.0, + "step": 8507 + }, + { + "epoch": 2.300703082747431, + "grad_norm": 2.46875, + "learning_rate": 0.012592495528009973, + "loss": 3.1346, + "mean_token_accuracy": 0.38333845138549805, + "num_tokens": 4349675738.0, + "step": 8508 + }, + { + "epoch": 2.3009734991887507, + "grad_norm": 2.15625, + "learning_rate": 0.012590943889915423, + "loss": 3.2429, + "mean_token_accuracy": 0.3901433050632477, + "num_tokens": 4350200013.0, + "step": 8509 + }, + { + "epoch": 2.3012439156300704, + "grad_norm": 3.78125, + "learning_rate": 0.012589392203006032, + "loss": 3.2533, + "mean_token_accuracy": 0.35789477825164795, + "num_tokens": 4350724060.0, + "step": 8510 + }, + { + "epoch": 2.30151433207139, + "grad_norm": 55.0, + "learning_rate": 0.012587840467329414, + "loss": 12.4342, + "mean_token_accuracy": 0.01835213042795658, + "num_tokens": 4351220145.0, + "step": 8511 + }, + { + "epoch": 2.3017847485127096, + "grad_norm": 7.21875, + "learning_rate": 0.01258628868293318, + "loss": 3.6132, + "mean_token_accuracy": 0.3477609157562256, + "num_tokens": 4351744419.0, + "step": 8512 + }, + { + "epoch": 2.3020551649540293, + "grad_norm": 2.421875, + "learning_rate": 0.012584736849864948, + "loss": 3.3544, + "mean_token_accuracy": 0.40926212072372437, + "num_tokens": 4352133476.0, + "step": 8513 + }, + { + "epoch": 2.302325581395349, + "grad_norm": 2.65625, + "learning_rate": 0.012583184968172326, + "loss": 3.2609, + "mean_token_accuracy": 0.38818660378456116, + "num_tokens": 4352657747.0, + "step": 8514 + }, + { + "epoch": 2.3025959978366686, + "grad_norm": 2.484375, + "learning_rate": 0.012581633037902933, + "loss": 3.015, + "mean_token_accuracy": 0.4215041399002075, + "num_tokens": 4353119657.0, + "step": 8515 + }, + { + "epoch": 2.302866414277988, + "grad_norm": 1.9609375, + "learning_rate": 0.012580081059104391, + "loss": 3.1784, + "mean_token_accuracy": 0.4035364091396332, + "num_tokens": 4353545961.0, + "step": 8516 + }, + { + "epoch": 2.303136830719308, + "grad_norm": 2.328125, + "learning_rate": 0.012578529031824312, + "loss": 3.1454, + "mean_token_accuracy": 0.401616632938385, + "num_tokens": 4354011425.0, + "step": 8517 + }, + { + "epoch": 2.3034072471606275, + "grad_norm": 3.390625, + "learning_rate": 0.012576976956110321, + "loss": 3.1916, + "mean_token_accuracy": 0.3913242220878601, + "num_tokens": 4354535523.0, + "step": 8518 + }, + { + "epoch": 2.303677663601947, + "grad_norm": 2.875, + "learning_rate": 0.012575424832010039, + "loss": 3.0761, + "mean_token_accuracy": 0.3954973518848419, + "num_tokens": 4355021115.0, + "step": 8519 + }, + { + "epoch": 2.303948080043267, + "grad_norm": 3.484375, + "learning_rate": 0.01257387265957109, + "loss": 3.0866, + "mean_token_accuracy": 0.4064875543117523, + "num_tokens": 4355516397.0, + "step": 8520 + }, + { + "epoch": 2.3042184964845864, + "grad_norm": 2.65625, + "learning_rate": 0.0125723204388411, + "loss": 2.7928, + "mean_token_accuracy": 0.4513387680053711, + "num_tokens": 4356040679.0, + "step": 8521 + }, + { + "epoch": 2.304488912925906, + "grad_norm": 2.515625, + "learning_rate": 0.012570768169867698, + "loss": 3.1113, + "mean_token_accuracy": 0.3967558741569519, + "num_tokens": 4356564900.0, + "step": 8522 + }, + { + "epoch": 2.3047593293672257, + "grad_norm": 3.390625, + "learning_rate": 0.012569215852698504, + "loss": 3.3013, + "mean_token_accuracy": 0.39335379004478455, + "num_tokens": 4357089138.0, + "step": 8523 + }, + { + "epoch": 2.305029745808545, + "grad_norm": 2.484375, + "learning_rate": 0.012567663487381154, + "loss": 3.222, + "mean_token_accuracy": 0.38207852840423584, + "num_tokens": 4357612114.0, + "step": 8524 + }, + { + "epoch": 2.305300162249865, + "grad_norm": 2.78125, + "learning_rate": 0.01256611107396328, + "loss": 3.1046, + "mean_token_accuracy": 0.40945640206336975, + "num_tokens": 4358136395.0, + "step": 8525 + }, + { + "epoch": 2.305570578691184, + "grad_norm": 2.375, + "learning_rate": 0.012564558612492512, + "loss": 3.0756, + "mean_token_accuracy": 0.40735870599746704, + "num_tokens": 4358660666.0, + "step": 8526 + }, + { + "epoch": 2.3058409951325043, + "grad_norm": 3.5625, + "learning_rate": 0.012563006103016484, + "loss": 3.1967, + "mean_token_accuracy": 0.3906812071800232, + "num_tokens": 4359184829.0, + "step": 8527 + }, + { + "epoch": 2.3061114115738235, + "grad_norm": 3.046875, + "learning_rate": 0.01256145354558283, + "loss": 3.0961, + "mean_token_accuracy": 0.40414005517959595, + "num_tokens": 4359709104.0, + "step": 8528 + }, + { + "epoch": 2.306381828015143, + "grad_norm": 9.5625, + "learning_rate": 0.012559900940239194, + "loss": 3.1804, + "mean_token_accuracy": 0.4048752188682556, + "num_tokens": 4360233259.0, + "step": 8529 + }, + { + "epoch": 2.3066522444564628, + "grad_norm": 2.109375, + "learning_rate": 0.01255834828703321, + "loss": 3.3778, + "mean_token_accuracy": 0.38241875171661377, + "num_tokens": 4360757476.0, + "step": 8530 + }, + { + "epoch": 2.3069226608977824, + "grad_norm": 140.0, + "learning_rate": 0.012556795586012512, + "loss": 18.5532, + "mean_token_accuracy": 0.0, + "num_tokens": 4361247256.0, + "step": 8531 + }, + { + "epoch": 2.307193077339102, + "grad_norm": 6.09375, + "learning_rate": 0.012555242837224754, + "loss": 3.5834, + "mean_token_accuracy": 0.36587586998939514, + "num_tokens": 4361771506.0, + "step": 8532 + }, + { + "epoch": 2.3074634937804217, + "grad_norm": 2.140625, + "learning_rate": 0.012553690040717567, + "loss": 3.3107, + "mean_token_accuracy": 0.3899717926979065, + "num_tokens": 4362295696.0, + "step": 8533 + }, + { + "epoch": 2.3077339102217413, + "grad_norm": 2.484375, + "learning_rate": 0.012552137196538603, + "loss": 3.2486, + "mean_token_accuracy": 0.38833677768707275, + "num_tokens": 4362819935.0, + "step": 8534 + }, + { + "epoch": 2.308004326663061, + "grad_norm": 3.78125, + "learning_rate": 0.012550584304735504, + "loss": 3.2975, + "mean_token_accuracy": 0.3829471170902252, + "num_tokens": 4363344067.0, + "step": 8535 + }, + { + "epoch": 2.3082747431043806, + "grad_norm": 2.609375, + "learning_rate": 0.012549031365355918, + "loss": 3.1307, + "mean_token_accuracy": 0.41767752170562744, + "num_tokens": 4363818910.0, + "step": 8536 + }, + { + "epoch": 2.3085451595457003, + "grad_norm": 2.53125, + "learning_rate": 0.012547478378447496, + "loss": 2.9462, + "mean_token_accuracy": 0.40427547693252563, + "num_tokens": 4364343070.0, + "step": 8537 + }, + { + "epoch": 2.30881557598702, + "grad_norm": 2.921875, + "learning_rate": 0.012545925344057888, + "loss": 2.8616, + "mean_token_accuracy": 0.414652556180954, + "num_tokens": 4364846135.0, + "step": 8538 + }, + { + "epoch": 2.3090859924283396, + "grad_norm": 2.453125, + "learning_rate": 0.012544372262234742, + "loss": 3.0415, + "mean_token_accuracy": 0.39857521653175354, + "num_tokens": 4365370388.0, + "step": 8539 + }, + { + "epoch": 2.309356408869659, + "grad_norm": 2.609375, + "learning_rate": 0.012542819133025712, + "loss": 3.0881, + "mean_token_accuracy": 0.4012833535671234, + "num_tokens": 4365894648.0, + "step": 8540 + }, + { + "epoch": 2.309626825310979, + "grad_norm": 3.046875, + "learning_rate": 0.012541265956478451, + "loss": 3.2229, + "mean_token_accuracy": 0.4022625684738159, + "num_tokens": 4366418908.0, + "step": 8541 + }, + { + "epoch": 2.3098972417522985, + "grad_norm": 3.375, + "learning_rate": 0.012539712732640625, + "loss": 3.2671, + "mean_token_accuracy": 0.41133198142051697, + "num_tokens": 4366883929.0, + "step": 8542 + }, + { + "epoch": 2.310167658193618, + "grad_norm": 3.65625, + "learning_rate": 0.01253815946155988, + "loss": 3.3419, + "mean_token_accuracy": 0.38961562514305115, + "num_tokens": 4367408173.0, + "step": 8543 + }, + { + "epoch": 2.3104380746349378, + "grad_norm": 3.96875, + "learning_rate": 0.012536606143283882, + "loss": 3.3241, + "mean_token_accuracy": 0.3653174638748169, + "num_tokens": 4367932349.0, + "step": 8544 + }, + { + "epoch": 2.3107084910762574, + "grad_norm": 2.875, + "learning_rate": 0.012535052777860288, + "loss": 3.1505, + "mean_token_accuracy": 0.38524043560028076, + "num_tokens": 4368456504.0, + "step": 8545 + }, + { + "epoch": 2.310978907517577, + "grad_norm": 2.328125, + "learning_rate": 0.012533499365336761, + "loss": 3.0743, + "mean_token_accuracy": 0.4129561185836792, + "num_tokens": 4368920127.0, + "step": 8546 + }, + { + "epoch": 2.3112493239588967, + "grad_norm": 2.4375, + "learning_rate": 0.01253194590576096, + "loss": 3.303, + "mean_token_accuracy": 0.39003539085388184, + "num_tokens": 4369444383.0, + "step": 8547 + }, + { + "epoch": 2.3115197404002163, + "grad_norm": 3.828125, + "learning_rate": 0.012530392399180555, + "loss": 3.2797, + "mean_token_accuracy": 0.3867994546890259, + "num_tokens": 4369968632.0, + "step": 8548 + }, + { + "epoch": 2.311790156841536, + "grad_norm": 2.65625, + "learning_rate": 0.012528838845643214, + "loss": 3.0461, + "mean_token_accuracy": 0.42085325717926025, + "num_tokens": 4370442218.0, + "step": 8549 + }, + { + "epoch": 2.3120605732828556, + "grad_norm": 3.25, + "learning_rate": 0.012527285245196598, + "loss": 3.0091, + "mean_token_accuracy": 0.4197566509246826, + "num_tokens": 4370911429.0, + "step": 8550 + }, + { + "epoch": 2.3123309897241753, + "grad_norm": 25.375, + "learning_rate": 0.01252573159788838, + "loss": 10.4264, + "mean_token_accuracy": 0.005014799535274506, + "num_tokens": 4371435691.0, + "step": 8551 + }, + { + "epoch": 2.312601406165495, + "grad_norm": 32.75, + "learning_rate": 0.012524177903766231, + "loss": 3.4807, + "mean_token_accuracy": 0.3996308147907257, + "num_tokens": 4371940611.0, + "step": 8552 + }, + { + "epoch": 2.3128718226068146, + "grad_norm": 4.53125, + "learning_rate": 0.01252262416287782, + "loss": 3.5321, + "mean_token_accuracy": 0.36429041624069214, + "num_tokens": 4372456990.0, + "step": 8553 + }, + { + "epoch": 2.313142239048134, + "grad_norm": 2.3125, + "learning_rate": 0.012521070375270822, + "loss": 3.1561, + "mean_token_accuracy": 0.3961297273635864, + "num_tokens": 4372908425.0, + "step": 8554 + }, + { + "epoch": 2.313412655489454, + "grad_norm": 2.828125, + "learning_rate": 0.012519516540992912, + "loss": 2.7262, + "mean_token_accuracy": 0.4327787756919861, + "num_tokens": 4373432553.0, + "step": 8555 + }, + { + "epoch": 2.3136830719307735, + "grad_norm": 2.359375, + "learning_rate": 0.012517962660091767, + "loss": 3.2584, + "mean_token_accuracy": 0.3732324242591858, + "num_tokens": 4373956800.0, + "step": 8556 + }, + { + "epoch": 2.313953488372093, + "grad_norm": 2.828125, + "learning_rate": 0.012516408732615059, + "loss": 3.0758, + "mean_token_accuracy": 0.39962923526763916, + "num_tokens": 4374433118.0, + "step": 8557 + }, + { + "epoch": 2.314223904813413, + "grad_norm": 3.046875, + "learning_rate": 0.012514854758610479, + "loss": 3.1581, + "mean_token_accuracy": 0.3915891647338867, + "num_tokens": 4374947328.0, + "step": 8558 + }, + { + "epoch": 2.3144943212547324, + "grad_norm": 2.828125, + "learning_rate": 0.012513300738125694, + "loss": 3.1613, + "mean_token_accuracy": 0.3624573349952698, + "num_tokens": 4375471473.0, + "step": 8559 + }, + { + "epoch": 2.314764737696052, + "grad_norm": 3.21875, + "learning_rate": 0.012511746671208393, + "loss": 3.3792, + "mean_token_accuracy": 0.37591904401779175, + "num_tokens": 4375995675.0, + "step": 8560 + }, + { + "epoch": 2.3150351541373717, + "grad_norm": 3.328125, + "learning_rate": 0.012510192557906261, + "loss": 3.2681, + "mean_token_accuracy": 0.37547969818115234, + "num_tokens": 4376519917.0, + "step": 8561 + }, + { + "epoch": 2.3153055705786914, + "grad_norm": 2.890625, + "learning_rate": 0.012508638398266977, + "loss": 3.3365, + "mean_token_accuracy": 0.40683355927467346, + "num_tokens": 4377044052.0, + "step": 8562 + }, + { + "epoch": 2.315575987020011, + "grad_norm": 4.1875, + "learning_rate": 0.012507084192338232, + "loss": 3.4357, + "mean_token_accuracy": 0.36764633655548096, + "num_tokens": 4377568255.0, + "step": 8563 + }, + { + "epoch": 2.3158464034613306, + "grad_norm": 2.703125, + "learning_rate": 0.012505529940167712, + "loss": 3.0492, + "mean_token_accuracy": 0.35246843099594116, + "num_tokens": 4378092517.0, + "step": 8564 + }, + { + "epoch": 2.31611681990265, + "grad_norm": 2.65625, + "learning_rate": 0.012503975641803108, + "loss": 3.0383, + "mean_token_accuracy": 0.3902289867401123, + "num_tokens": 4378616759.0, + "step": 8565 + }, + { + "epoch": 2.31638723634397, + "grad_norm": 2.578125, + "learning_rate": 0.012502421297292106, + "loss": 3.1509, + "mean_token_accuracy": 0.3994755148887634, + "num_tokens": 4379140914.0, + "step": 8566 + }, + { + "epoch": 2.316657652785289, + "grad_norm": 2.53125, + "learning_rate": 0.0125008669066824, + "loss": 3.1282, + "mean_token_accuracy": 0.4089150130748749, + "num_tokens": 4379665165.0, + "step": 8567 + }, + { + "epoch": 2.316928069226609, + "grad_norm": 2.65625, + "learning_rate": 0.012499312470021685, + "loss": 3.1525, + "mean_token_accuracy": 0.3948008418083191, + "num_tokens": 4380189276.0, + "step": 8568 + }, + { + "epoch": 2.3171984856679284, + "grad_norm": 2.890625, + "learning_rate": 0.012497757987357652, + "loss": 3.0021, + "mean_token_accuracy": 0.408333420753479, + "num_tokens": 4380713500.0, + "step": 8569 + }, + { + "epoch": 2.317468902109248, + "grad_norm": 3.578125, + "learning_rate": 0.012496203458738, + "loss": 3.1367, + "mean_token_accuracy": 0.4005547761917114, + "num_tokens": 4381237742.0, + "step": 8570 + }, + { + "epoch": 2.3177393185505677, + "grad_norm": 43.5, + "learning_rate": 0.012494648884210426, + "loss": 19.1836, + "mean_token_accuracy": 0.03824175149202347, + "num_tokens": 4381761886.0, + "step": 8571 + }, + { + "epoch": 2.3180097349918873, + "grad_norm": 9.1875, + "learning_rate": 0.01249309426382263, + "loss": 3.6134, + "mean_token_accuracy": 0.32768216729164124, + "num_tokens": 4382286147.0, + "step": 8572 + }, + { + "epoch": 2.318280151433207, + "grad_norm": 2.53125, + "learning_rate": 0.012491539597622312, + "loss": 3.4479, + "mean_token_accuracy": 0.3790997862815857, + "num_tokens": 4382748439.0, + "step": 8573 + }, + { + "epoch": 2.3185505678745266, + "grad_norm": 2.4375, + "learning_rate": 0.01248998488565717, + "loss": 3.2548, + "mean_token_accuracy": 0.39256054162979126, + "num_tokens": 4383272634.0, + "step": 8574 + }, + { + "epoch": 2.3188209843158463, + "grad_norm": 4.625, + "learning_rate": 0.012488430127974915, + "loss": 3.3949, + "mean_token_accuracy": 0.3787413239479065, + "num_tokens": 4383796814.0, + "step": 8575 + }, + { + "epoch": 2.319091400757166, + "grad_norm": 2.90625, + "learning_rate": 0.01248687532462324, + "loss": 3.1994, + "mean_token_accuracy": 0.39721572399139404, + "num_tokens": 4384320983.0, + "step": 8576 + }, + { + "epoch": 2.3193618171984856, + "grad_norm": 3.125, + "learning_rate": 0.012485320475649857, + "loss": 3.1817, + "mean_token_accuracy": 0.3891454339027405, + "num_tokens": 4384834943.0, + "step": 8577 + }, + { + "epoch": 2.319632233639805, + "grad_norm": 2.46875, + "learning_rate": 0.012483765581102478, + "loss": 3.1833, + "mean_token_accuracy": 0.3966245949268341, + "num_tokens": 4385322329.0, + "step": 8578 + }, + { + "epoch": 2.319902650081125, + "grad_norm": 2.78125, + "learning_rate": 0.012482210641028809, + "loss": 3.1794, + "mean_token_accuracy": 0.39004194736480713, + "num_tokens": 4385846595.0, + "step": 8579 + }, + { + "epoch": 2.3201730665224445, + "grad_norm": 2.671875, + "learning_rate": 0.012480655655476553, + "loss": 3.1236, + "mean_token_accuracy": 0.40393462777137756, + "num_tokens": 4386370876.0, + "step": 8580 + }, + { + "epoch": 2.320443482963764, + "grad_norm": 2.453125, + "learning_rate": 0.012479100624493434, + "loss": 3.1555, + "mean_token_accuracy": 0.40775027871131897, + "num_tokens": 4386895067.0, + "step": 8581 + }, + { + "epoch": 2.3207138994050838, + "grad_norm": 2.546875, + "learning_rate": 0.012477545548127154, + "loss": 3.0532, + "mean_token_accuracy": 0.3959271311759949, + "num_tokens": 4387419254.0, + "step": 8582 + }, + { + "epoch": 2.3209843158464034, + "grad_norm": 2.546875, + "learning_rate": 0.012475990426425427, + "loss": 3.0668, + "mean_token_accuracy": 0.39237067103385925, + "num_tokens": 4387943450.0, + "step": 8583 + }, + { + "epoch": 2.321254732287723, + "grad_norm": 2.96875, + "learning_rate": 0.012474435259435979, + "loss": 3.2532, + "mean_token_accuracy": 0.36833250522613525, + "num_tokens": 4388467665.0, + "step": 8584 + }, + { + "epoch": 2.3215251487290427, + "grad_norm": 2.578125, + "learning_rate": 0.01247288004720652, + "loss": 3.1631, + "mean_token_accuracy": 0.38561102747917175, + "num_tokens": 4388991834.0, + "step": 8585 + }, + { + "epoch": 2.3217955651703623, + "grad_norm": 2.890625, + "learning_rate": 0.012471324789784771, + "loss": 3.0812, + "mean_token_accuracy": 0.3979509770870209, + "num_tokens": 4389490991.0, + "step": 8586 + }, + { + "epoch": 2.322065981611682, + "grad_norm": 3.03125, + "learning_rate": 0.012469769487218448, + "loss": 3.3138, + "mean_token_accuracy": 0.4029107391834259, + "num_tokens": 4389958579.0, + "step": 8587 + }, + { + "epoch": 2.3223363980530016, + "grad_norm": 2.75, + "learning_rate": 0.012468214139555275, + "loss": 2.976, + "mean_token_accuracy": 0.4103637635707855, + "num_tokens": 4390482715.0, + "step": 8588 + }, + { + "epoch": 2.3226068144943213, + "grad_norm": 2.828125, + "learning_rate": 0.012466658746842978, + "loss": 3.222, + "mean_token_accuracy": 0.4066270589828491, + "num_tokens": 4391006886.0, + "step": 8589 + }, + { + "epoch": 2.322877230935641, + "grad_norm": 3.203125, + "learning_rate": 0.012465103309129276, + "loss": 3.1017, + "mean_token_accuracy": 0.41103893518447876, + "num_tokens": 4391511749.0, + "step": 8590 + }, + { + "epoch": 2.3231476473769606, + "grad_norm": 112.5, + "learning_rate": 0.012463547826461892, + "loss": 11.2652, + "mean_token_accuracy": 0.013128786347806454, + "num_tokens": 4391980610.0, + "step": 8591 + }, + { + "epoch": 2.32341806381828, + "grad_norm": 5.375, + "learning_rate": 0.012461992298888558, + "loss": 3.2927, + "mean_token_accuracy": 0.3828970193862915, + "num_tokens": 4392504870.0, + "step": 8592 + }, + { + "epoch": 2.3236884802596, + "grad_norm": 2.71875, + "learning_rate": 0.012460436726457002, + "loss": 3.4055, + "mean_token_accuracy": 0.3999921679496765, + "num_tokens": 4392999093.0, + "step": 8593 + }, + { + "epoch": 2.3239588967009195, + "grad_norm": 2.796875, + "learning_rate": 0.012458881109214953, + "loss": 3.1941, + "mean_token_accuracy": 0.38675954937934875, + "num_tokens": 4393523333.0, + "step": 8594 + }, + { + "epoch": 2.324229313142239, + "grad_norm": 3.234375, + "learning_rate": 0.012457325447210144, + "loss": 3.2036, + "mean_token_accuracy": 0.4018789529800415, + "num_tokens": 4394030914.0, + "step": 8595 + }, + { + "epoch": 2.3244997295835588, + "grad_norm": 3.546875, + "learning_rate": 0.012455769740490301, + "loss": 3.1761, + "mean_token_accuracy": 0.379341185092926, + "num_tokens": 4394514813.0, + "step": 8596 + }, + { + "epoch": 2.3247701460248784, + "grad_norm": 2.546875, + "learning_rate": 0.012454213989103161, + "loss": 3.1439, + "mean_token_accuracy": 0.3957883417606354, + "num_tokens": 4394982145.0, + "step": 8597 + }, + { + "epoch": 2.325040562466198, + "grad_norm": 3.09375, + "learning_rate": 0.01245265819309646, + "loss": 3.0511, + "mean_token_accuracy": 0.41400346159935, + "num_tokens": 4395445107.0, + "step": 8598 + }, + { + "epoch": 2.3253109789075177, + "grad_norm": 2.375, + "learning_rate": 0.012451102352517934, + "loss": 3.2389, + "mean_token_accuracy": 0.37496548891067505, + "num_tokens": 4395923363.0, + "step": 8599 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 2.546875, + "learning_rate": 0.01244954646741532, + "loss": 2.9993, + "mean_token_accuracy": 0.44308415055274963, + "num_tokens": 4396343560.0, + "step": 8600 + }, + { + "epoch": 2.325851811790157, + "grad_norm": 3.25, + "learning_rate": 0.012447990537836358, + "loss": 3.108, + "mean_token_accuracy": 0.4114904999732971, + "num_tokens": 4396867537.0, + "step": 8601 + }, + { + "epoch": 2.3261222282314766, + "grad_norm": 3.484375, + "learning_rate": 0.012446434563828785, + "loss": 3.2261, + "mean_token_accuracy": 0.39580512046813965, + "num_tokens": 4397391742.0, + "step": 8602 + }, + { + "epoch": 2.3263926446727963, + "grad_norm": 2.9375, + "learning_rate": 0.01244487854544035, + "loss": 3.2982, + "mean_token_accuracy": 0.39355987310409546, + "num_tokens": 4397916009.0, + "step": 8603 + }, + { + "epoch": 2.326663061114116, + "grad_norm": 2.546875, + "learning_rate": 0.01244332248271879, + "loss": 3.1868, + "mean_token_accuracy": 0.3820264935493469, + "num_tokens": 4398440231.0, + "step": 8604 + }, + { + "epoch": 2.3269334775554356, + "grad_norm": 3.265625, + "learning_rate": 0.012441766375711854, + "loss": 2.9507, + "mean_token_accuracy": 0.3912726044654846, + "num_tokens": 4398964319.0, + "step": 8605 + }, + { + "epoch": 2.3272038939967548, + "grad_norm": 2.390625, + "learning_rate": 0.012440210224467283, + "loss": 3.1104, + "mean_token_accuracy": 0.4102725386619568, + "num_tokens": 4399392669.0, + "step": 8606 + }, + { + "epoch": 2.327474310438075, + "grad_norm": 2.546875, + "learning_rate": 0.012438654029032827, + "loss": 3.1989, + "mean_token_accuracy": 0.37748533487319946, + "num_tokens": 4399916949.0, + "step": 8607 + }, + { + "epoch": 2.327744726879394, + "grad_norm": 3.5625, + "learning_rate": 0.012437097789456238, + "loss": 2.9988, + "mean_token_accuracy": 0.4223981499671936, + "num_tokens": 4400441217.0, + "step": 8608 + }, + { + "epoch": 2.328015143320714, + "grad_norm": 3.5625, + "learning_rate": 0.012435541505785259, + "loss": 3.3554, + "mean_token_accuracy": 0.38741937279701233, + "num_tokens": 4400965459.0, + "step": 8609 + }, + { + "epoch": 2.3282855597620333, + "grad_norm": 3.703125, + "learning_rate": 0.012433985178067646, + "loss": 3.1518, + "mean_token_accuracy": 0.3899764120578766, + "num_tokens": 4401489659.0, + "step": 8610 + }, + { + "epoch": 2.328555976203353, + "grad_norm": 264.0, + "learning_rate": 0.01243242880635115, + "loss": 11.2321, + "mean_token_accuracy": 0.00722785759717226, + "num_tokens": 4401910919.0, + "step": 8611 + }, + { + "epoch": 2.3288263926446726, + "grad_norm": 4.65625, + "learning_rate": 0.012430872390683528, + "loss": 3.4135, + "mean_token_accuracy": 0.3660550117492676, + "num_tokens": 4402435191.0, + "step": 8612 + }, + { + "epoch": 2.3290968090859923, + "grad_norm": 1.78125, + "learning_rate": 0.012429315931112529, + "loss": 3.0925, + "mean_token_accuracy": 0.390617698431015, + "num_tokens": 4402959467.0, + "step": 8613 + }, + { + "epoch": 2.329367225527312, + "grad_norm": 2.375, + "learning_rate": 0.012427759427685918, + "loss": 3.294, + "mean_token_accuracy": 0.39561375975608826, + "num_tokens": 4403434260.0, + "step": 8614 + }, + { + "epoch": 2.3296376419686315, + "grad_norm": 2.953125, + "learning_rate": 0.012426202880451449, + "loss": 2.9198, + "mean_token_accuracy": 0.41472533345222473, + "num_tokens": 4403958452.0, + "step": 8615 + }, + { + "epoch": 2.329908058409951, + "grad_norm": 2.234375, + "learning_rate": 0.012424646289456878, + "loss": 3.1114, + "mean_token_accuracy": 0.4204922616481781, + "num_tokens": 4404482665.0, + "step": 8616 + }, + { + "epoch": 2.330178474851271, + "grad_norm": 2.671875, + "learning_rate": 0.012423089654749971, + "loss": 3.104, + "mean_token_accuracy": 0.395710825920105, + "num_tokens": 4405006921.0, + "step": 8617 + }, + { + "epoch": 2.3304488912925905, + "grad_norm": 2.703125, + "learning_rate": 0.01242153297637849, + "loss": 3.1103, + "mean_token_accuracy": 0.4254133701324463, + "num_tokens": 4405531181.0, + "step": 8618 + }, + { + "epoch": 2.33071930773391, + "grad_norm": 3.140625, + "learning_rate": 0.012419976254390192, + "loss": 3.0651, + "mean_token_accuracy": 0.38819271326065063, + "num_tokens": 4406055389.0, + "step": 8619 + }, + { + "epoch": 2.3309897241752298, + "grad_norm": 2.734375, + "learning_rate": 0.012418419488832851, + "loss": 2.9422, + "mean_token_accuracy": 0.40408360958099365, + "num_tokens": 4406547202.0, + "step": 8620 + }, + { + "epoch": 2.3312601406165494, + "grad_norm": 2.875, + "learning_rate": 0.012416862679754228, + "loss": 2.687, + "mean_token_accuracy": 0.45039862394332886, + "num_tokens": 4407033753.0, + "step": 8621 + }, + { + "epoch": 2.331530557057869, + "grad_norm": 2.421875, + "learning_rate": 0.012415305827202093, + "loss": 3.076, + "mean_token_accuracy": 0.4130874276161194, + "num_tokens": 4407501765.0, + "step": 8622 + }, + { + "epoch": 2.3318009734991887, + "grad_norm": 3.59375, + "learning_rate": 0.01241374893122421, + "loss": 3.369, + "mean_token_accuracy": 0.37450337409973145, + "num_tokens": 4408026022.0, + "step": 8623 + }, + { + "epoch": 2.3320713899405083, + "grad_norm": 3.125, + "learning_rate": 0.012412191991868354, + "loss": 2.9984, + "mean_token_accuracy": 0.3986714780330658, + "num_tokens": 4408496318.0, + "step": 8624 + }, + { + "epoch": 2.332341806381828, + "grad_norm": 2.421875, + "learning_rate": 0.012410635009182294, + "loss": 3.0712, + "mean_token_accuracy": 0.4024854600429535, + "num_tokens": 4409020571.0, + "step": 8625 + }, + { + "epoch": 2.3326122228231476, + "grad_norm": 3.03125, + "learning_rate": 0.012409077983213801, + "loss": 3.1888, + "mean_token_accuracy": 0.390681654214859, + "num_tokens": 4409544752.0, + "step": 8626 + }, + { + "epoch": 2.3328826392644673, + "grad_norm": 2.4375, + "learning_rate": 0.012407520914010655, + "loss": 3.1716, + "mean_token_accuracy": 0.40577584505081177, + "num_tokens": 4410054596.0, + "step": 8627 + }, + { + "epoch": 2.333153055705787, + "grad_norm": 2.546875, + "learning_rate": 0.012405963801620628, + "loss": 3.1572, + "mean_token_accuracy": 0.39351868629455566, + "num_tokens": 4410578857.0, + "step": 8628 + }, + { + "epoch": 2.3334234721471065, + "grad_norm": 2.90625, + "learning_rate": 0.012404406646091494, + "loss": 3.2722, + "mean_token_accuracy": 0.390292763710022, + "num_tokens": 4411077049.0, + "step": 8629 + }, + { + "epoch": 2.333693888588426, + "grad_norm": 2.609375, + "learning_rate": 0.012402849447471036, + "loss": 2.8219, + "mean_token_accuracy": 0.418903112411499, + "num_tokens": 4411560459.0, + "step": 8630 + }, + { + "epoch": 2.333964305029746, + "grad_norm": 25.875, + "learning_rate": 0.012401292205807036, + "loss": 9.9974, + "mean_token_accuracy": 0.031101737171411514, + "num_tokens": 4412084720.0, + "step": 8631 + }, + { + "epoch": 2.3342347214710655, + "grad_norm": 5.125, + "learning_rate": 0.012399734921147263, + "loss": 3.3783, + "mean_token_accuracy": 0.36386436223983765, + "num_tokens": 4412608993.0, + "step": 8632 + }, + { + "epoch": 2.334505137912385, + "grad_norm": 1.515625, + "learning_rate": 0.012398177593539507, + "loss": 3.0945, + "mean_token_accuracy": 0.4097273349761963, + "num_tokens": 4413114545.0, + "step": 8633 + }, + { + "epoch": 2.3347755543537048, + "grad_norm": 3.0625, + "learning_rate": 0.012396620223031554, + "loss": 3.2928, + "mean_token_accuracy": 0.37669306993484497, + "num_tokens": 4413638771.0, + "step": 8634 + }, + { + "epoch": 2.3350459707950244, + "grad_norm": 3.03125, + "learning_rate": 0.012395062809671184, + "loss": 3.1188, + "mean_token_accuracy": 0.38533514738082886, + "num_tokens": 4414163051.0, + "step": 8635 + }, + { + "epoch": 2.335316387236344, + "grad_norm": 2.875, + "learning_rate": 0.012393505353506185, + "loss": 3.1815, + "mean_token_accuracy": 0.40011149644851685, + "num_tokens": 4414687311.0, + "step": 8636 + }, + { + "epoch": 2.3355868036776637, + "grad_norm": 3.15625, + "learning_rate": 0.012391947854584345, + "loss": 3.0527, + "mean_token_accuracy": 0.41725969314575195, + "num_tokens": 4415205413.0, + "step": 8637 + }, + { + "epoch": 2.3358572201189833, + "grad_norm": 3.0, + "learning_rate": 0.01239039031295345, + "loss": 3.0812, + "mean_token_accuracy": 0.38202381134033203, + "num_tokens": 4415729674.0, + "step": 8638 + }, + { + "epoch": 2.336127636560303, + "grad_norm": 2.828125, + "learning_rate": 0.012388832728661288, + "loss": 3.1364, + "mean_token_accuracy": 0.4114217758178711, + "num_tokens": 4416253933.0, + "step": 8639 + }, + { + "epoch": 2.3363980530016226, + "grad_norm": 3.125, + "learning_rate": 0.012387275101755657, + "loss": 3.0162, + "mean_token_accuracy": 0.43976151943206787, + "num_tokens": 4416778107.0, + "step": 8640 + }, + { + "epoch": 2.3366684694429423, + "grad_norm": 3.5625, + "learning_rate": 0.012385717432284345, + "loss": 3.3617, + "mean_token_accuracy": 0.4034360945224762, + "num_tokens": 4417206581.0, + "step": 8641 + }, + { + "epoch": 2.336938885884262, + "grad_norm": 3.25, + "learning_rate": 0.012384159720295147, + "loss": 3.0967, + "mean_token_accuracy": 0.39492470026016235, + "num_tokens": 4417730703.0, + "step": 8642 + }, + { + "epoch": 2.3372093023255816, + "grad_norm": 3.4375, + "learning_rate": 0.012382601965835861, + "loss": 3.0848, + "mean_token_accuracy": 0.41173094511032104, + "num_tokens": 4418201559.0, + "step": 8643 + }, + { + "epoch": 2.337479718766901, + "grad_norm": 3.015625, + "learning_rate": 0.012381044168954278, + "loss": 3.1488, + "mean_token_accuracy": 0.38306331634521484, + "num_tokens": 4418725571.0, + "step": 8644 + }, + { + "epoch": 2.337750135208221, + "grad_norm": 3.296875, + "learning_rate": 0.0123794863296982, + "loss": 3.0537, + "mean_token_accuracy": 0.4227633476257324, + "num_tokens": 4419249827.0, + "step": 8645 + }, + { + "epoch": 2.3380205516495405, + "grad_norm": 3.328125, + "learning_rate": 0.012377928448115423, + "loss": 3.1438, + "mean_token_accuracy": 0.4029121994972229, + "num_tokens": 4419774050.0, + "step": 8646 + }, + { + "epoch": 2.3382909680908597, + "grad_norm": 3.4375, + "learning_rate": 0.012376370524253751, + "loss": 3.2294, + "mean_token_accuracy": 0.40045201778411865, + "num_tokens": 4420298289.0, + "step": 8647 + }, + { + "epoch": 2.3385613845321798, + "grad_norm": 3.21875, + "learning_rate": 0.012374812558160982, + "loss": 3.0952, + "mean_token_accuracy": 0.4019858241081238, + "num_tokens": 4420822351.0, + "step": 8648 + }, + { + "epoch": 2.338831800973499, + "grad_norm": 2.6875, + "learning_rate": 0.01237325454988492, + "loss": 3.1021, + "mean_token_accuracy": 0.4032764136791229, + "num_tokens": 4421346581.0, + "step": 8649 + }, + { + "epoch": 2.339102217414819, + "grad_norm": 3.140625, + "learning_rate": 0.012371696499473372, + "loss": 3.0722, + "mean_token_accuracy": 0.4101347327232361, + "num_tokens": 4421870689.0, + "step": 8650 + }, + { + "epoch": 2.3393726338561383, + "grad_norm": 398.0, + "learning_rate": 0.012370138406974141, + "loss": 30.6204, + "mean_token_accuracy": 2.7454721930553205e-05, + "num_tokens": 4422394909.0, + "step": 8651 + }, + { + "epoch": 2.339643050297458, + "grad_norm": 8.625, + "learning_rate": 0.012368580272435033, + "loss": 3.724, + "mean_token_accuracy": 0.37765997648239136, + "num_tokens": 4422919010.0, + "step": 8652 + }, + { + "epoch": 2.3399134667387775, + "grad_norm": 2.6875, + "learning_rate": 0.012367022095903858, + "loss": 3.328, + "mean_token_accuracy": 0.3841484785079956, + "num_tokens": 4423443184.0, + "step": 8653 + }, + { + "epoch": 2.340183883180097, + "grad_norm": 2.125, + "learning_rate": 0.012365463877428425, + "loss": 3.2262, + "mean_token_accuracy": 0.3909277319908142, + "num_tokens": 4423967331.0, + "step": 8654 + }, + { + "epoch": 2.340454299621417, + "grad_norm": 3.4375, + "learning_rate": 0.012363905617056543, + "loss": 2.9454, + "mean_token_accuracy": 0.39205896854400635, + "num_tokens": 4424491545.0, + "step": 8655 + }, + { + "epoch": 2.3407247160627365, + "grad_norm": 2.15625, + "learning_rate": 0.012362347314836028, + "loss": 3.3992, + "mean_token_accuracy": 0.379785418510437, + "num_tokens": 4425011427.0, + "step": 8656 + }, + { + "epoch": 2.340995132504056, + "grad_norm": 2.546875, + "learning_rate": 0.012360788970814694, + "loss": 3.1653, + "mean_token_accuracy": 0.3969328999519348, + "num_tokens": 4425535639.0, + "step": 8657 + }, + { + "epoch": 2.3412655489453758, + "grad_norm": 2.5625, + "learning_rate": 0.012359230585040348, + "loss": 3.1289, + "mean_token_accuracy": 0.40964072942733765, + "num_tokens": 4426059850.0, + "step": 8658 + }, + { + "epoch": 2.3415359653866954, + "grad_norm": 3.09375, + "learning_rate": 0.012357672157560812, + "loss": 3.3012, + "mean_token_accuracy": 0.3943014144897461, + "num_tokens": 4426584099.0, + "step": 8659 + }, + { + "epoch": 2.341806381828015, + "grad_norm": 3.4375, + "learning_rate": 0.012356113688423903, + "loss": 2.9882, + "mean_token_accuracy": 0.4286959767341614, + "num_tokens": 4427072475.0, + "step": 8660 + }, + { + "epoch": 2.3420767982693347, + "grad_norm": 2.453125, + "learning_rate": 0.012354555177677431, + "loss": 2.9325, + "mean_token_accuracy": 0.41405874490737915, + "num_tokens": 4427596574.0, + "step": 8661 + }, + { + "epoch": 2.3423472147106543, + "grad_norm": 3.0, + "learning_rate": 0.01235299662536923, + "loss": 3.1948, + "mean_token_accuracy": 0.38789990544319153, + "num_tokens": 4428120767.0, + "step": 8662 + }, + { + "epoch": 2.342617631151974, + "grad_norm": 2.359375, + "learning_rate": 0.012351438031547111, + "loss": 2.915, + "mean_token_accuracy": 0.4124939441680908, + "num_tokens": 4428637683.0, + "step": 8663 + }, + { + "epoch": 2.3428880475932936, + "grad_norm": 3.109375, + "learning_rate": 0.0123498793962589, + "loss": 3.1082, + "mean_token_accuracy": 0.4128643274307251, + "num_tokens": 4429161809.0, + "step": 8664 + }, + { + "epoch": 2.3431584640346133, + "grad_norm": 2.765625, + "learning_rate": 0.012348320719552417, + "loss": 3.4326, + "mean_token_accuracy": 0.3535691201686859, + "num_tokens": 4429685987.0, + "step": 8665 + }, + { + "epoch": 2.343428880475933, + "grad_norm": 2.9375, + "learning_rate": 0.01234676200147549, + "loss": 3.1101, + "mean_token_accuracy": 0.38491618633270264, + "num_tokens": 4430210242.0, + "step": 8666 + }, + { + "epoch": 2.3436992969172525, + "grad_norm": 3.0625, + "learning_rate": 0.012345203242075949, + "loss": 3.336, + "mean_token_accuracy": 0.37991100549697876, + "num_tokens": 4430734366.0, + "step": 8667 + }, + { + "epoch": 2.343969713358572, + "grad_norm": 3.609375, + "learning_rate": 0.012343644441401609, + "loss": 3.1624, + "mean_token_accuracy": 0.3860401213169098, + "num_tokens": 4431253270.0, + "step": 8668 + }, + { + "epoch": 2.344240129799892, + "grad_norm": 2.90625, + "learning_rate": 0.01234208559950031, + "loss": 2.9685, + "mean_token_accuracy": 0.4199235737323761, + "num_tokens": 4431741580.0, + "step": 8669 + }, + { + "epoch": 2.3445105462412115, + "grad_norm": 2.796875, + "learning_rate": 0.01234052671641988, + "loss": 3.1953, + "mean_token_accuracy": 0.38740143179893494, + "num_tokens": 4432265799.0, + "step": 8670 + }, + { + "epoch": 2.344780962682531, + "grad_norm": 19.5, + "learning_rate": 0.012338967792208147, + "loss": 11.8696, + "mean_token_accuracy": 0.03960971534252167, + "num_tokens": 4432738633.0, + "step": 8671 + }, + { + "epoch": 2.3450513791238508, + "grad_norm": 5.78125, + "learning_rate": 0.012337408826912942, + "loss": 3.3681, + "mean_token_accuracy": 0.3635762929916382, + "num_tokens": 4433209166.0, + "step": 8672 + }, + { + "epoch": 2.3453217955651704, + "grad_norm": 1.90625, + "learning_rate": 0.012335849820582103, + "loss": 3.1698, + "mean_token_accuracy": 0.4044809937477112, + "num_tokens": 4433727782.0, + "step": 8673 + }, + { + "epoch": 2.34559221200649, + "grad_norm": 3.609375, + "learning_rate": 0.012334290773263463, + "loss": 3.3189, + "mean_token_accuracy": 0.3793265223503113, + "num_tokens": 4434251994.0, + "step": 8674 + }, + { + "epoch": 2.3458626284478097, + "grad_norm": 2.875, + "learning_rate": 0.012332731685004858, + "loss": 3.0758, + "mean_token_accuracy": 0.41063714027404785, + "num_tokens": 4434776274.0, + "step": 8675 + }, + { + "epoch": 2.3461330448891293, + "grad_norm": 3.1875, + "learning_rate": 0.012331172555854123, + "loss": 3.3378, + "mean_token_accuracy": 0.38376057147979736, + "num_tokens": 4435298599.0, + "step": 8676 + }, + { + "epoch": 2.346403461330449, + "grad_norm": 2.46875, + "learning_rate": 0.012329613385859101, + "loss": 3.1258, + "mean_token_accuracy": 0.41557982563972473, + "num_tokens": 4435804947.0, + "step": 8677 + }, + { + "epoch": 2.3466738777717686, + "grad_norm": 2.671875, + "learning_rate": 0.012328054175067632, + "loss": 3.1552, + "mean_token_accuracy": 0.38078999519348145, + "num_tokens": 4436329036.0, + "step": 8678 + }, + { + "epoch": 2.3469442942130883, + "grad_norm": 2.359375, + "learning_rate": 0.012326494923527555, + "loss": 3.3934, + "mean_token_accuracy": 0.3826846480369568, + "num_tokens": 4436853301.0, + "step": 8679 + }, + { + "epoch": 2.347214710654408, + "grad_norm": 2.796875, + "learning_rate": 0.01232493563128671, + "loss": 3.0, + "mean_token_accuracy": 0.41098934412002563, + "num_tokens": 4437377583.0, + "step": 8680 + }, + { + "epoch": 2.3474851270957275, + "grad_norm": 2.921875, + "learning_rate": 0.012323376298392948, + "loss": 2.9908, + "mean_token_accuracy": 0.3995947241783142, + "num_tokens": 4437901815.0, + "step": 8681 + }, + { + "epoch": 2.347755543537047, + "grad_norm": 2.890625, + "learning_rate": 0.012321816924894105, + "loss": 3.3857, + "mean_token_accuracy": 0.38621407747268677, + "num_tokens": 4438426080.0, + "step": 8682 + }, + { + "epoch": 2.348025959978367, + "grad_norm": 3.96875, + "learning_rate": 0.012320257510838033, + "loss": 3.1932, + "mean_token_accuracy": 0.37760478258132935, + "num_tokens": 4438950354.0, + "step": 8683 + }, + { + "epoch": 2.3482963764196865, + "grad_norm": 3.0, + "learning_rate": 0.01231869805627258, + "loss": 2.9287, + "mean_token_accuracy": 0.39814576506614685, + "num_tokens": 4439474594.0, + "step": 8684 + }, + { + "epoch": 2.348566792861006, + "grad_norm": 3.03125, + "learning_rate": 0.012317138561245588, + "loss": 3.0234, + "mean_token_accuracy": 0.3903089761734009, + "num_tokens": 4439998664.0, + "step": 8685 + }, + { + "epoch": 2.3488372093023258, + "grad_norm": 3.0, + "learning_rate": 0.012315579025804916, + "loss": 3.1927, + "mean_token_accuracy": 0.3812127113342285, + "num_tokens": 4440522857.0, + "step": 8686 + }, + { + "epoch": 2.3491076257436454, + "grad_norm": 2.9375, + "learning_rate": 0.012314019449998409, + "loss": 3.2883, + "mean_token_accuracy": 0.3964262008666992, + "num_tokens": 4441047050.0, + "step": 8687 + }, + { + "epoch": 2.3493780421849646, + "grad_norm": 2.953125, + "learning_rate": 0.01231245983387392, + "loss": 3.3412, + "mean_token_accuracy": 0.37301093339920044, + "num_tokens": 4441542109.0, + "step": 8688 + }, + { + "epoch": 2.3496484586262847, + "grad_norm": 2.875, + "learning_rate": 0.012310900177479305, + "loss": 3.1301, + "mean_token_accuracy": 0.3991876542568207, + "num_tokens": 4442066368.0, + "step": 8689 + }, + { + "epoch": 2.349918875067604, + "grad_norm": 4.0, + "learning_rate": 0.012309340480862416, + "loss": 3.2351, + "mean_token_accuracy": 0.39134323596954346, + "num_tokens": 4442590549.0, + "step": 8690 + }, + { + "epoch": 2.350189291508924, + "grad_norm": 20.375, + "learning_rate": 0.012307780744071111, + "loss": 9.9504, + "mean_token_accuracy": 0.014387029223144054, + "num_tokens": 4443114830.0, + "step": 8691 + }, + { + "epoch": 2.350459707950243, + "grad_norm": 5.75, + "learning_rate": 0.012306220967153248, + "loss": 3.5993, + "mean_token_accuracy": 0.3436817526817322, + "num_tokens": 4443639103.0, + "step": 8692 + }, + { + "epoch": 2.350730124391563, + "grad_norm": 2.171875, + "learning_rate": 0.012304661150156684, + "loss": 3.2783, + "mean_token_accuracy": 0.3606610596179962, + "num_tokens": 4444163244.0, + "step": 8693 + }, + { + "epoch": 2.3510005408328825, + "grad_norm": 2.5, + "learning_rate": 0.01230310129312928, + "loss": 3.159, + "mean_token_accuracy": 0.38718560338020325, + "num_tokens": 4444687496.0, + "step": 8694 + }, + { + "epoch": 2.351270957274202, + "grad_norm": 2.75, + "learning_rate": 0.012301541396118894, + "loss": 3.1547, + "mean_token_accuracy": 0.40218597650527954, + "num_tokens": 4445211676.0, + "step": 8695 + }, + { + "epoch": 2.3515413737155217, + "grad_norm": 2.59375, + "learning_rate": 0.01229998145917339, + "loss": 3.1174, + "mean_token_accuracy": 0.39677390456199646, + "num_tokens": 4445735829.0, + "step": 8696 + }, + { + "epoch": 2.3518117901568414, + "grad_norm": 2.9375, + "learning_rate": 0.012298421482340633, + "loss": 3.0972, + "mean_token_accuracy": 0.4137679636478424, + "num_tokens": 4446248905.0, + "step": 8697 + }, + { + "epoch": 2.352082206598161, + "grad_norm": 3.53125, + "learning_rate": 0.012296861465668487, + "loss": 3.1774, + "mean_token_accuracy": 0.39313408732414246, + "num_tokens": 4446773157.0, + "step": 8698 + }, + { + "epoch": 2.3523526230394807, + "grad_norm": 2.75, + "learning_rate": 0.012295301409204817, + "loss": 2.8224, + "mean_token_accuracy": 0.40943464636802673, + "num_tokens": 4447274968.0, + "step": 8699 + }, + { + "epoch": 2.3526230394808003, + "grad_norm": 3.546875, + "learning_rate": 0.012293741312997493, + "loss": 2.9075, + "mean_token_accuracy": 0.4077015519142151, + "num_tokens": 4447775838.0, + "step": 8700 + }, + { + "epoch": 2.35289345592212, + "grad_norm": 2.921875, + "learning_rate": 0.01229218117709438, + "loss": 3.0943, + "mean_token_accuracy": 0.40080398321151733, + "num_tokens": 4448300068.0, + "step": 8701 + }, + { + "epoch": 2.3531638723634396, + "grad_norm": 2.890625, + "learning_rate": 0.012290621001543345, + "loss": 3.1594, + "mean_token_accuracy": 0.3704937696456909, + "num_tokens": 4448824271.0, + "step": 8702 + }, + { + "epoch": 2.3534342888047592, + "grad_norm": 2.640625, + "learning_rate": 0.012289060786392263, + "loss": 3.0085, + "mean_token_accuracy": 0.43275898694992065, + "num_tokens": 4449314327.0, + "step": 8703 + }, + { + "epoch": 2.353704705246079, + "grad_norm": 2.765625, + "learning_rate": 0.012287500531689007, + "loss": 3.319, + "mean_token_accuracy": 0.3824532628059387, + "num_tokens": 4449781473.0, + "step": 8704 + }, + { + "epoch": 2.3539751216873985, + "grad_norm": 3.0, + "learning_rate": 0.012285940237481446, + "loss": 2.9974, + "mean_token_accuracy": 0.4032413959503174, + "num_tokens": 4450305637.0, + "step": 8705 + }, + { + "epoch": 2.354245538128718, + "grad_norm": 2.390625, + "learning_rate": 0.012284379903817462, + "loss": 3.0532, + "mean_token_accuracy": 0.4089217782020569, + "num_tokens": 4450791190.0, + "step": 8706 + }, + { + "epoch": 2.354515954570038, + "grad_norm": 2.6875, + "learning_rate": 0.01228281953074492, + "loss": 3.1784, + "mean_token_accuracy": 0.3913789987564087, + "num_tokens": 4451315473.0, + "step": 8707 + }, + { + "epoch": 2.3547863710113575, + "grad_norm": 2.59375, + "learning_rate": 0.012281259118311705, + "loss": 3.0997, + "mean_token_accuracy": 0.40377306938171387, + "num_tokens": 4451812543.0, + "step": 8708 + }, + { + "epoch": 2.355056787452677, + "grad_norm": 3.203125, + "learning_rate": 0.01227969866656569, + "loss": 3.3219, + "mean_token_accuracy": 0.37058231234550476, + "num_tokens": 4452336650.0, + "step": 8709 + }, + { + "epoch": 2.3553272038939967, + "grad_norm": 2.515625, + "learning_rate": 0.01227813817555476, + "loss": 3.1875, + "mean_token_accuracy": 0.40325987339019775, + "num_tokens": 4452860752.0, + "step": 8710 + }, + { + "epoch": 2.3555976203353164, + "grad_norm": 46.75, + "learning_rate": 0.012276577645326787, + "loss": 11.3424, + "mean_token_accuracy": 8.578685810789466e-05, + "num_tokens": 4453320385.0, + "step": 8711 + }, + { + "epoch": 2.355868036776636, + "grad_norm": 7.65625, + "learning_rate": 0.012275017075929663, + "loss": 3.6439, + "mean_token_accuracy": 0.35097062587738037, + "num_tokens": 4453844628.0, + "step": 8712 + }, + { + "epoch": 2.3561384532179557, + "grad_norm": 2.34375, + "learning_rate": 0.012273456467411265, + "loss": 3.2223, + "mean_token_accuracy": 0.4094007611274719, + "num_tokens": 4454335040.0, + "step": 8713 + }, + { + "epoch": 2.3564088696592753, + "grad_norm": 2.609375, + "learning_rate": 0.012271895819819474, + "loss": 3.1833, + "mean_token_accuracy": 0.39821118116378784, + "num_tokens": 4454859317.0, + "step": 8714 + }, + { + "epoch": 2.356679286100595, + "grad_norm": 2.984375, + "learning_rate": 0.012270335133202179, + "loss": 3.2297, + "mean_token_accuracy": 0.40033984184265137, + "num_tokens": 4455383575.0, + "step": 8715 + }, + { + "epoch": 2.3569497025419146, + "grad_norm": 2.96875, + "learning_rate": 0.012268774407607272, + "loss": 3.4222, + "mean_token_accuracy": 0.3728600740432739, + "num_tokens": 4455853698.0, + "step": 8716 + }, + { + "epoch": 2.3572201189832342, + "grad_norm": 2.375, + "learning_rate": 0.012267213643082631, + "loss": 2.953, + "mean_token_accuracy": 0.4119889438152313, + "num_tokens": 4456377824.0, + "step": 8717 + }, + { + "epoch": 2.357490535424554, + "grad_norm": 3.125, + "learning_rate": 0.012265652839676145, + "loss": 2.9539, + "mean_token_accuracy": 0.4346826672554016, + "num_tokens": 4456802726.0, + "step": 8718 + }, + { + "epoch": 2.3577609518658735, + "grad_norm": 2.703125, + "learning_rate": 0.012264091997435713, + "loss": 3.0312, + "mean_token_accuracy": 0.3990051746368408, + "num_tokens": 4457326835.0, + "step": 8719 + }, + { + "epoch": 2.358031368307193, + "grad_norm": 3.046875, + "learning_rate": 0.012262531116409221, + "loss": 3.0194, + "mean_token_accuracy": 0.3970589339733124, + "num_tokens": 4457851044.0, + "step": 8720 + }, + { + "epoch": 2.358301784748513, + "grad_norm": 2.765625, + "learning_rate": 0.012260970196644558, + "loss": 3.0678, + "mean_token_accuracy": 0.3925601541996002, + "num_tokens": 4458375240.0, + "step": 8721 + }, + { + "epoch": 2.3585722011898325, + "grad_norm": 3.015625, + "learning_rate": 0.012259409238189625, + "loss": 2.9792, + "mean_token_accuracy": 0.40681758522987366, + "num_tokens": 4458899383.0, + "step": 8722 + }, + { + "epoch": 2.358842617631152, + "grad_norm": 2.84375, + "learning_rate": 0.012257848241092311, + "loss": 3.0944, + "mean_token_accuracy": 0.3773735761642456, + "num_tokens": 4459423578.0, + "step": 8723 + }, + { + "epoch": 2.3591130340724717, + "grad_norm": 2.703125, + "learning_rate": 0.012256287205400515, + "loss": 2.9474, + "mean_token_accuracy": 0.4245334267616272, + "num_tokens": 4459947618.0, + "step": 8724 + }, + { + "epoch": 2.3593834505137914, + "grad_norm": 2.65625, + "learning_rate": 0.012254726131162132, + "loss": 3.2993, + "mean_token_accuracy": 0.39276957511901855, + "num_tokens": 4460471896.0, + "step": 8725 + }, + { + "epoch": 2.359653866955111, + "grad_norm": 2.765625, + "learning_rate": 0.012253165018425062, + "loss": 3.0673, + "mean_token_accuracy": 0.3848520815372467, + "num_tokens": 4460996175.0, + "step": 8726 + }, + { + "epoch": 2.3599242833964307, + "grad_norm": 2.328125, + "learning_rate": 0.012251603867237202, + "loss": 2.9613, + "mean_token_accuracy": 0.39518994092941284, + "num_tokens": 4461520366.0, + "step": 8727 + }, + { + "epoch": 2.3601946998377503, + "grad_norm": 3.390625, + "learning_rate": 0.012250042677646455, + "loss": 3.156, + "mean_token_accuracy": 0.4062962532043457, + "num_tokens": 4462044564.0, + "step": 8728 + }, + { + "epoch": 2.3604651162790695, + "grad_norm": 3.234375, + "learning_rate": 0.012248481449700727, + "loss": 3.0893, + "mean_token_accuracy": 0.3820972442626953, + "num_tokens": 4462568436.0, + "step": 8729 + }, + { + "epoch": 2.3607355327203896, + "grad_norm": 3.578125, + "learning_rate": 0.012246920183447911, + "loss": 3.1965, + "mean_token_accuracy": 0.388618528842926, + "num_tokens": 4463092523.0, + "step": 8730 + }, + { + "epoch": 2.361005949161709, + "grad_norm": 6.0, + "learning_rate": 0.01224535887893592, + "loss": 10.8293, + "mean_token_accuracy": 1.4267254300648347e-05, + "num_tokens": 4463616657.0, + "step": 8731 + }, + { + "epoch": 2.361276365603029, + "grad_norm": 7.03125, + "learning_rate": 0.012243797536212653, + "loss": 3.4823, + "mean_token_accuracy": 0.35594770312309265, + "num_tokens": 4464082032.0, + "step": 8732 + }, + { + "epoch": 2.361546782044348, + "grad_norm": 37.25, + "learning_rate": 0.012242236155326021, + "loss": 3.2717, + "mean_token_accuracy": 0.3714587092399597, + "num_tokens": 4464547967.0, + "step": 8733 + }, + { + "epoch": 2.3618171984856677, + "grad_norm": 3.578125, + "learning_rate": 0.012240674736323934, + "loss": 3.2132, + "mean_token_accuracy": 0.38757967948913574, + "num_tokens": 4465025830.0, + "step": 8734 + }, + { + "epoch": 2.3620876149269874, + "grad_norm": 5.09375, + "learning_rate": 0.012239113279254294, + "loss": 2.9827, + "mean_token_accuracy": 0.4570215940475464, + "num_tokens": 4465549945.0, + "step": 8735 + }, + { + "epoch": 2.362358031368307, + "grad_norm": 8.0, + "learning_rate": 0.012237551784165017, + "loss": 3.1565, + "mean_token_accuracy": 0.376003235578537, + "num_tokens": 4466074171.0, + "step": 8736 + }, + { + "epoch": 2.3626284478096267, + "grad_norm": 1.875, + "learning_rate": 0.01223599025110401, + "loss": 3.067, + "mean_token_accuracy": 0.3850993514060974, + "num_tokens": 4466598419.0, + "step": 8737 + }, + { + "epoch": 2.3628988642509463, + "grad_norm": 3.640625, + "learning_rate": 0.012234428680119187, + "loss": 3.0709, + "mean_token_accuracy": 0.39106810092926025, + "num_tokens": 4467122683.0, + "step": 8738 + }, + { + "epoch": 2.363169280692266, + "grad_norm": 2.578125, + "learning_rate": 0.012232867071258462, + "loss": 3.3771, + "mean_token_accuracy": 0.37649428844451904, + "num_tokens": 4467646889.0, + "step": 8739 + }, + { + "epoch": 2.3634396971335856, + "grad_norm": 3.578125, + "learning_rate": 0.01223130542456975, + "loss": 3.3555, + "mean_token_accuracy": 0.39603984355926514, + "num_tokens": 4468171153.0, + "step": 8740 + }, + { + "epoch": 2.3637101135749052, + "grad_norm": 2.765625, + "learning_rate": 0.012229743740100968, + "loss": 3.029, + "mean_token_accuracy": 0.4033178687095642, + "num_tokens": 4468695340.0, + "step": 8741 + }, + { + "epoch": 2.363980530016225, + "grad_norm": 3.328125, + "learning_rate": 0.012228182017900031, + "loss": 3.1806, + "mean_token_accuracy": 0.39192962646484375, + "num_tokens": 4469219529.0, + "step": 8742 + }, + { + "epoch": 2.3642509464575445, + "grad_norm": 3.265625, + "learning_rate": 0.012226620258014858, + "loss": 3.2346, + "mean_token_accuracy": 0.37702691555023193, + "num_tokens": 4469743655.0, + "step": 8743 + }, + { + "epoch": 2.364521362898864, + "grad_norm": 2.953125, + "learning_rate": 0.012225058460493366, + "loss": 3.221, + "mean_token_accuracy": 0.35114362835884094, + "num_tokens": 4470267906.0, + "step": 8744 + }, + { + "epoch": 2.364791779340184, + "grad_norm": 2.34375, + "learning_rate": 0.01222349662538348, + "loss": 3.2419, + "mean_token_accuracy": 0.38990166783332825, + "num_tokens": 4470766998.0, + "step": 8745 + }, + { + "epoch": 2.3650621957815035, + "grad_norm": 2.9375, + "learning_rate": 0.01222193475273312, + "loss": 3.2403, + "mean_token_accuracy": 0.38802164793014526, + "num_tokens": 4471291197.0, + "step": 8746 + }, + { + "epoch": 2.365332612222823, + "grad_norm": 3.0625, + "learning_rate": 0.012220372842590204, + "loss": 3.1995, + "mean_token_accuracy": 0.3874503970146179, + "num_tokens": 4471773172.0, + "step": 8747 + }, + { + "epoch": 2.3656030286641427, + "grad_norm": 4.125, + "learning_rate": 0.012218810895002666, + "loss": 3.1533, + "mean_token_accuracy": 0.396801233291626, + "num_tokens": 4472289596.0, + "step": 8748 + }, + { + "epoch": 2.3658734451054624, + "grad_norm": 2.625, + "learning_rate": 0.012217248910018423, + "loss": 2.9187, + "mean_token_accuracy": 0.410908579826355, + "num_tokens": 4472781312.0, + "step": 8749 + }, + { + "epoch": 2.366143861546782, + "grad_norm": 2.734375, + "learning_rate": 0.012215686887685403, + "loss": 3.1589, + "mean_token_accuracy": 0.3887554109096527, + "num_tokens": 4473305588.0, + "step": 8750 + }, + { + "epoch": 2.3664142779881017, + "grad_norm": 13.9375, + "learning_rate": 0.012214124828051537, + "loss": 12.3003, + "mean_token_accuracy": 0.011119754053652287, + "num_tokens": 4473829844.0, + "step": 8751 + }, + { + "epoch": 2.3666846944294213, + "grad_norm": 6.34375, + "learning_rate": 0.01221256273116475, + "loss": 3.3354, + "mean_token_accuracy": 0.3553789258003235, + "num_tokens": 4474353915.0, + "step": 8752 + }, + { + "epoch": 2.366955110870741, + "grad_norm": 1.9453125, + "learning_rate": 0.012211000597072967, + "loss": 3.2219, + "mean_token_accuracy": 0.38266661763191223, + "num_tokens": 4474878094.0, + "step": 8753 + }, + { + "epoch": 2.3672255273120606, + "grad_norm": 2.953125, + "learning_rate": 0.012209438425824129, + "loss": 2.9179, + "mean_token_accuracy": 0.41562527418136597, + "num_tokens": 4475402215.0, + "step": 8754 + }, + { + "epoch": 2.3674959437533802, + "grad_norm": 2.828125, + "learning_rate": 0.012207876217466163, + "loss": 3.2124, + "mean_token_accuracy": 0.41351962089538574, + "num_tokens": 4475926314.0, + "step": 8755 + }, + { + "epoch": 2.3677663601947, + "grad_norm": 3.234375, + "learning_rate": 0.012206313972047004, + "loss": 3.1685, + "mean_token_accuracy": 0.4007457196712494, + "num_tokens": 4476428550.0, + "step": 8756 + }, + { + "epoch": 2.3680367766360195, + "grad_norm": 2.828125, + "learning_rate": 0.012204751689614582, + "loss": 3.1382, + "mean_token_accuracy": 0.40571826696395874, + "num_tokens": 4476903303.0, + "step": 8757 + }, + { + "epoch": 2.368307193077339, + "grad_norm": 2.765625, + "learning_rate": 0.012203189370216836, + "loss": 3.2942, + "mean_token_accuracy": 0.40755289793014526, + "num_tokens": 4477410351.0, + "step": 8758 + }, + { + "epoch": 2.368577609518659, + "grad_norm": 2.78125, + "learning_rate": 0.012201627013901702, + "loss": 3.3314, + "mean_token_accuracy": 0.3764188885688782, + "num_tokens": 4477934615.0, + "step": 8759 + }, + { + "epoch": 2.3688480259599785, + "grad_norm": 2.671875, + "learning_rate": 0.012200064620717117, + "loss": 3.2122, + "mean_token_accuracy": 0.40933436155319214, + "num_tokens": 4478458886.0, + "step": 8760 + }, + { + "epoch": 2.369118442401298, + "grad_norm": 3.1875, + "learning_rate": 0.012198502190711018, + "loss": 3.2714, + "mean_token_accuracy": 0.3709351420402527, + "num_tokens": 4478983151.0, + "step": 8761 + }, + { + "epoch": 2.3693888588426177, + "grad_norm": 3.53125, + "learning_rate": 0.012196939723931353, + "loss": 3.2053, + "mean_token_accuracy": 0.3831919729709625, + "num_tokens": 4479507275.0, + "step": 8762 + }, + { + "epoch": 2.3696592752839374, + "grad_norm": 2.75, + "learning_rate": 0.012195377220426054, + "loss": 3.0651, + "mean_token_accuracy": 0.40545010566711426, + "num_tokens": 4479973504.0, + "step": 8763 + }, + { + "epoch": 2.369929691725257, + "grad_norm": 2.90625, + "learning_rate": 0.012193814680243067, + "loss": 3.2752, + "mean_token_accuracy": 0.3783584237098694, + "num_tokens": 4480478864.0, + "step": 8764 + }, + { + "epoch": 2.3702001081665767, + "grad_norm": 2.84375, + "learning_rate": 0.012192252103430334, + "loss": 3.0351, + "mean_token_accuracy": 0.3975718021392822, + "num_tokens": 4481003043.0, + "step": 8765 + }, + { + "epoch": 2.3704705246078963, + "grad_norm": 2.78125, + "learning_rate": 0.012190689490035803, + "loss": 3.0968, + "mean_token_accuracy": 0.3771992325782776, + "num_tokens": 4481527269.0, + "step": 8766 + }, + { + "epoch": 2.370740941049216, + "grad_norm": 2.75, + "learning_rate": 0.012189126840107414, + "loss": 3.3166, + "mean_token_accuracy": 0.3847626745700836, + "num_tokens": 4482051447.0, + "step": 8767 + }, + { + "epoch": 2.3710113574905356, + "grad_norm": 2.875, + "learning_rate": 0.012187564153693119, + "loss": 3.1092, + "mean_token_accuracy": 0.396608829498291, + "num_tokens": 4482575604.0, + "step": 8768 + }, + { + "epoch": 2.3712817739318552, + "grad_norm": 2.09375, + "learning_rate": 0.012186001430840861, + "loss": 3.027, + "mean_token_accuracy": 0.42534971237182617, + "num_tokens": 4483099740.0, + "step": 8769 + }, + { + "epoch": 2.3715521903731744, + "grad_norm": 2.671875, + "learning_rate": 0.01218443867159859, + "loss": 3.1403, + "mean_token_accuracy": 0.39059701561927795, + "num_tokens": 4483623995.0, + "step": 8770 + }, + { + "epoch": 2.3718226068144945, + "grad_norm": 278.0, + "learning_rate": 0.012182875876014259, + "loss": 24.5262, + "mean_token_accuracy": 0.0, + "num_tokens": 4484148205.0, + "step": 8771 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 6.65625, + "learning_rate": 0.01218131304413582, + "loss": 3.4468, + "mean_token_accuracy": 0.37642547488212585, + "num_tokens": 4484672441.0, + "step": 8772 + }, + { + "epoch": 2.372363439697134, + "grad_norm": 3.296875, + "learning_rate": 0.012179750176011223, + "loss": 3.071, + "mean_token_accuracy": 0.39915651082992554, + "num_tokens": 4485145173.0, + "step": 8773 + }, + { + "epoch": 2.372633856138453, + "grad_norm": 2.53125, + "learning_rate": 0.012178187271688417, + "loss": 3.0864, + "mean_token_accuracy": 0.38180452585220337, + "num_tokens": 4485669421.0, + "step": 8774 + }, + { + "epoch": 2.372904272579773, + "grad_norm": 3.125, + "learning_rate": 0.01217662433121536, + "loss": 3.0812, + "mean_token_accuracy": 0.3894941210746765, + "num_tokens": 4486193661.0, + "step": 8775 + }, + { + "epoch": 2.3731746890210923, + "grad_norm": 2.5, + "learning_rate": 0.012175061354640011, + "loss": 2.7654, + "mean_token_accuracy": 0.4030193090438843, + "num_tokens": 4486717917.0, + "step": 8776 + }, + { + "epoch": 2.373445105462412, + "grad_norm": 2.0625, + "learning_rate": 0.012173498342010326, + "loss": 3.1526, + "mean_token_accuracy": 0.4088263511657715, + "num_tokens": 4487225918.0, + "step": 8777 + }, + { + "epoch": 2.3737155219037316, + "grad_norm": 2.4375, + "learning_rate": 0.012171935293374259, + "loss": 2.9254, + "mean_token_accuracy": 0.4126051068305969, + "num_tokens": 4487750179.0, + "step": 8778 + }, + { + "epoch": 2.3739859383450512, + "grad_norm": 2.234375, + "learning_rate": 0.01217037220877977, + "loss": 3.0017, + "mean_token_accuracy": 0.4074217975139618, + "num_tokens": 4488230940.0, + "step": 8779 + }, + { + "epoch": 2.374256354786371, + "grad_norm": 2.28125, + "learning_rate": 0.012168809088274819, + "loss": 2.896, + "mean_token_accuracy": 0.4118598699569702, + "num_tokens": 4488755191.0, + "step": 8780 + }, + { + "epoch": 2.3745267712276905, + "grad_norm": 2.828125, + "learning_rate": 0.012167245931907369, + "loss": 3.1584, + "mean_token_accuracy": 0.3983590006828308, + "num_tokens": 4489279447.0, + "step": 8781 + }, + { + "epoch": 2.37479718766901, + "grad_norm": 2.5625, + "learning_rate": 0.01216568273972538, + "loss": 3.0155, + "mean_token_accuracy": 0.41034457087516785, + "num_tokens": 4489795855.0, + "step": 8782 + }, + { + "epoch": 2.37506760411033, + "grad_norm": 4.65625, + "learning_rate": 0.012164119511776818, + "loss": 3.07, + "mean_token_accuracy": 0.43730002641677856, + "num_tokens": 4490297910.0, + "step": 8783 + }, + { + "epoch": 2.3753380205516494, + "grad_norm": 2.015625, + "learning_rate": 0.012162556248109647, + "loss": 3.0787, + "mean_token_accuracy": 0.3933052122592926, + "num_tokens": 4490822105.0, + "step": 8784 + }, + { + "epoch": 2.375608436992969, + "grad_norm": 2.984375, + "learning_rate": 0.012160992948771833, + "loss": 2.9723, + "mean_token_accuracy": 0.41708287596702576, + "num_tokens": 4491324926.0, + "step": 8785 + }, + { + "epoch": 2.3758788534342887, + "grad_norm": 2.8125, + "learning_rate": 0.012159429613811336, + "loss": 3.0504, + "mean_token_accuracy": 0.41032925248146057, + "num_tokens": 4491849065.0, + "step": 8786 + }, + { + "epoch": 2.3761492698756084, + "grad_norm": 2.65625, + "learning_rate": 0.01215786624327613, + "loss": 3.0229, + "mean_token_accuracy": 0.3921457529067993, + "num_tokens": 4492373256.0, + "step": 8787 + }, + { + "epoch": 2.376419686316928, + "grad_norm": 2.59375, + "learning_rate": 0.012156302837214186, + "loss": 3.142, + "mean_token_accuracy": 0.3935674726963043, + "num_tokens": 4492897479.0, + "step": 8788 + }, + { + "epoch": 2.3766901027582477, + "grad_norm": 2.640625, + "learning_rate": 0.012154739395673466, + "loss": 3.0325, + "mean_token_accuracy": 0.4070674777030945, + "num_tokens": 4493421654.0, + "step": 8789 + }, + { + "epoch": 2.3769605191995673, + "grad_norm": 2.796875, + "learning_rate": 0.012153175918701947, + "loss": 2.8836, + "mean_token_accuracy": 0.41190165281295776, + "num_tokens": 4493945899.0, + "step": 8790 + }, + { + "epoch": 2.377230935640887, + "grad_norm": 13.625, + "learning_rate": 0.0121516124063476, + "loss": 10.7461, + "mean_token_accuracy": 0.009270399808883667, + "num_tokens": 4494421671.0, + "step": 8791 + }, + { + "epoch": 2.3775013520822066, + "grad_norm": 7.625, + "learning_rate": 0.0121500488586584, + "loss": 3.3779, + "mean_token_accuracy": 0.35918015241622925, + "num_tokens": 4494945756.0, + "step": 8792 + }, + { + "epoch": 2.3777717685235262, + "grad_norm": 2.09375, + "learning_rate": 0.012148485275682314, + "loss": 3.1633, + "mean_token_accuracy": 0.3929985463619232, + "num_tokens": 4495469984.0, + "step": 8793 + }, + { + "epoch": 2.378042184964846, + "grad_norm": 4.21875, + "learning_rate": 0.012146921657467327, + "loss": 3.503, + "mean_token_accuracy": 0.35592225193977356, + "num_tokens": 4495983376.0, + "step": 8794 + }, + { + "epoch": 2.3783126014061655, + "grad_norm": 3.265625, + "learning_rate": 0.01214535800406141, + "loss": 3.0878, + "mean_token_accuracy": 0.42648106813430786, + "num_tokens": 4496507626.0, + "step": 8795 + }, + { + "epoch": 2.378583017847485, + "grad_norm": 2.484375, + "learning_rate": 0.012143794315512537, + "loss": 2.956, + "mean_token_accuracy": 0.4100949764251709, + "num_tokens": 4497031817.0, + "step": 8796 + }, + { + "epoch": 2.378853434288805, + "grad_norm": 3.15625, + "learning_rate": 0.012142230591868693, + "loss": 2.7685, + "mean_token_accuracy": 0.4163888096809387, + "num_tokens": 4497555959.0, + "step": 8797 + }, + { + "epoch": 2.3791238507301244, + "grad_norm": 2.6875, + "learning_rate": 0.012140666833177858, + "loss": 3.0104, + "mean_token_accuracy": 0.40893107652664185, + "num_tokens": 4498080080.0, + "step": 8798 + }, + { + "epoch": 2.379394267171444, + "grad_norm": 3.546875, + "learning_rate": 0.012139103039488008, + "loss": 3.3571, + "mean_token_accuracy": 0.3935757875442505, + "num_tokens": 4498604268.0, + "step": 8799 + }, + { + "epoch": 2.3796646836127637, + "grad_norm": 2.828125, + "learning_rate": 0.012137539210847129, + "loss": 3.052, + "mean_token_accuracy": 0.4003549814224243, + "num_tokens": 4499128448.0, + "step": 8800 + }, + { + "epoch": 2.3799351000540834, + "grad_norm": 3.90625, + "learning_rate": 0.012135975347303199, + "loss": 3.119, + "mean_token_accuracy": 0.41361290216445923, + "num_tokens": 4499652677.0, + "step": 8801 + }, + { + "epoch": 2.380205516495403, + "grad_norm": 3.09375, + "learning_rate": 0.012134411448904203, + "loss": 2.907, + "mean_token_accuracy": 0.41693639755249023, + "num_tokens": 4500176903.0, + "step": 8802 + }, + { + "epoch": 2.3804759329367227, + "grad_norm": 2.953125, + "learning_rate": 0.01213284751569813, + "loss": 3.3877, + "mean_token_accuracy": 0.38868874311447144, + "num_tokens": 4500701185.0, + "step": 8803 + }, + { + "epoch": 2.3807463493780423, + "grad_norm": 2.8125, + "learning_rate": 0.012131283547732963, + "loss": 3.0832, + "mean_token_accuracy": 0.38455450534820557, + "num_tokens": 4501225228.0, + "step": 8804 + }, + { + "epoch": 2.381016765819362, + "grad_norm": 3.09375, + "learning_rate": 0.012129719545056691, + "loss": 3.1201, + "mean_token_accuracy": 0.37651753425598145, + "num_tokens": 4501749362.0, + "step": 8805 + }, + { + "epoch": 2.3812871822606816, + "grad_norm": 2.078125, + "learning_rate": 0.0121281555077173, + "loss": 3.1463, + "mean_token_accuracy": 0.4033181071281433, + "num_tokens": 4502244864.0, + "step": 8806 + }, + { + "epoch": 2.3815575987020012, + "grad_norm": 3.21875, + "learning_rate": 0.012126591435762784, + "loss": 3.2967, + "mean_token_accuracy": 0.4112316071987152, + "num_tokens": 4502680994.0, + "step": 8807 + }, + { + "epoch": 2.381828015143321, + "grad_norm": 2.46875, + "learning_rate": 0.012125027329241127, + "loss": 3.1879, + "mean_token_accuracy": 0.36544716358184814, + "num_tokens": 4503205278.0, + "step": 8808 + }, + { + "epoch": 2.3820984315846405, + "grad_norm": 2.84375, + "learning_rate": 0.012123463188200321, + "loss": 3.0665, + "mean_token_accuracy": 0.4097039997577667, + "num_tokens": 4503729554.0, + "step": 8809 + }, + { + "epoch": 2.38236884802596, + "grad_norm": 3.265625, + "learning_rate": 0.012121899012688364, + "loss": 3.1027, + "mean_token_accuracy": 0.3997608721256256, + "num_tokens": 4504253736.0, + "step": 8810 + }, + { + "epoch": 2.3826392644672794, + "grad_norm": 28.75, + "learning_rate": 0.012120334802753244, + "loss": 19.8774, + "mean_token_accuracy": 0.0, + "num_tokens": 4504778006.0, + "step": 8811 + }, + { + "epoch": 2.3829096809085994, + "grad_norm": 5.875, + "learning_rate": 0.012118770558442956, + "loss": 3.6132, + "mean_token_accuracy": 0.35388532280921936, + "num_tokens": 4505302258.0, + "step": 8812 + }, + { + "epoch": 2.3831800973499186, + "grad_norm": 2.359375, + "learning_rate": 0.0121172062798055, + "loss": 3.0916, + "mean_token_accuracy": 0.3733377754688263, + "num_tokens": 4505826491.0, + "step": 8813 + }, + { + "epoch": 2.3834505137912387, + "grad_norm": 2.734375, + "learning_rate": 0.012115641966888866, + "loss": 3.1969, + "mean_token_accuracy": 0.36925792694091797, + "num_tokens": 4506350682.0, + "step": 8814 + }, + { + "epoch": 2.383720930232558, + "grad_norm": 2.640625, + "learning_rate": 0.01211407761974106, + "loss": 3.1596, + "mean_token_accuracy": 0.38784441351890564, + "num_tokens": 4506874959.0, + "step": 8815 + }, + { + "epoch": 2.383991346673878, + "grad_norm": 2.640625, + "learning_rate": 0.012112513238410075, + "loss": 3.157, + "mean_token_accuracy": 0.39035093784332275, + "num_tokens": 4507398967.0, + "step": 8816 + }, + { + "epoch": 2.384261763115197, + "grad_norm": 2.71875, + "learning_rate": 0.012110948822943913, + "loss": 3.2054, + "mean_token_accuracy": 0.4062073230743408, + "num_tokens": 4507905517.0, + "step": 8817 + }, + { + "epoch": 2.384532179556517, + "grad_norm": 3.3125, + "learning_rate": 0.012109384373390571, + "loss": 3.1531, + "mean_token_accuracy": 0.39226067066192627, + "num_tokens": 4508395619.0, + "step": 8818 + }, + { + "epoch": 2.3848025959978365, + "grad_norm": 2.875, + "learning_rate": 0.012107819889798055, + "loss": 2.9616, + "mean_token_accuracy": 0.39792606234550476, + "num_tokens": 4508919867.0, + "step": 8819 + }, + { + "epoch": 2.385073012439156, + "grad_norm": 2.8125, + "learning_rate": 0.012106255372214367, + "loss": 3.1129, + "mean_token_accuracy": 0.40392327308654785, + "num_tokens": 4509380359.0, + "step": 8820 + }, + { + "epoch": 2.385343428880476, + "grad_norm": 2.859375, + "learning_rate": 0.012104690820687514, + "loss": 3.4024, + "mean_token_accuracy": 0.36248892545700073, + "num_tokens": 4509904581.0, + "step": 8821 + }, + { + "epoch": 2.3856138453217954, + "grad_norm": 2.78125, + "learning_rate": 0.012103126235265495, + "loss": 3.0202, + "mean_token_accuracy": 0.4035969376564026, + "num_tokens": 4510428684.0, + "step": 8822 + }, + { + "epoch": 2.385884261763115, + "grad_norm": 2.875, + "learning_rate": 0.01210156161599632, + "loss": 3.0133, + "mean_token_accuracy": 0.4071378707885742, + "num_tokens": 4510952955.0, + "step": 8823 + }, + { + "epoch": 2.3861546782044347, + "grad_norm": 3.390625, + "learning_rate": 0.012099996962927994, + "loss": 3.2413, + "mean_token_accuracy": 0.3897998034954071, + "num_tokens": 4511477227.0, + "step": 8824 + }, + { + "epoch": 2.3864250946457544, + "grad_norm": 3.40625, + "learning_rate": 0.01209843227610853, + "loss": 3.2695, + "mean_token_accuracy": 0.3743423819541931, + "num_tokens": 4512001415.0, + "step": 8825 + }, + { + "epoch": 2.386695511087074, + "grad_norm": 2.609375, + "learning_rate": 0.012096867555585927, + "loss": 3.0275, + "mean_token_accuracy": 0.41438379883766174, + "num_tokens": 4512525636.0, + "step": 8826 + }, + { + "epoch": 2.3869659275283936, + "grad_norm": 3.375, + "learning_rate": 0.012095302801408206, + "loss": 3.2134, + "mean_token_accuracy": 0.38715457916259766, + "num_tokens": 4513049846.0, + "step": 8827 + }, + { + "epoch": 2.3872363439697133, + "grad_norm": 2.765625, + "learning_rate": 0.012093738013623375, + "loss": 2.8905, + "mean_token_accuracy": 0.4132755398750305, + "num_tokens": 4513574123.0, + "step": 8828 + }, + { + "epoch": 2.387506760411033, + "grad_norm": 2.4375, + "learning_rate": 0.01209217319227944, + "loss": 3.2233, + "mean_token_accuracy": 0.3815491199493408, + "num_tokens": 4514045742.0, + "step": 8829 + }, + { + "epoch": 2.3877771768523526, + "grad_norm": 2.625, + "learning_rate": 0.012090608337424423, + "loss": 3.0736, + "mean_token_accuracy": 0.40543925762176514, + "num_tokens": 4514518033.0, + "step": 8830 + }, + { + "epoch": 2.388047593293672, + "grad_norm": 5.9375, + "learning_rate": 0.012089043449106336, + "loss": 10.8244, + "mean_token_accuracy": 0.02145358920097351, + "num_tokens": 4515042050.0, + "step": 8831 + }, + { + "epoch": 2.388318009734992, + "grad_norm": 6.59375, + "learning_rate": 0.012087478527373192, + "loss": 3.4439, + "mean_token_accuracy": 0.33043214678764343, + "num_tokens": 4515566241.0, + "step": 8832 + }, + { + "epoch": 2.3885884261763115, + "grad_norm": 2.015625, + "learning_rate": 0.012085913572273009, + "loss": 2.9475, + "mean_token_accuracy": 0.40077298879623413, + "num_tokens": 4516090005.0, + "step": 8833 + }, + { + "epoch": 2.388858842617631, + "grad_norm": 2.046875, + "learning_rate": 0.012084348583853806, + "loss": 3.184, + "mean_token_accuracy": 0.376194566488266, + "num_tokens": 4516614132.0, + "step": 8834 + }, + { + "epoch": 2.389129259058951, + "grad_norm": 2.765625, + "learning_rate": 0.012082783562163597, + "loss": 3.1458, + "mean_token_accuracy": 0.4000105857849121, + "num_tokens": 4517138243.0, + "step": 8835 + }, + { + "epoch": 2.3893996755002704, + "grad_norm": 2.375, + "learning_rate": 0.012081218507250404, + "loss": 3.1971, + "mean_token_accuracy": 0.3877757787704468, + "num_tokens": 4517618635.0, + "step": 8836 + }, + { + "epoch": 2.38967009194159, + "grad_norm": 2.671875, + "learning_rate": 0.01207965341916225, + "loss": 3.1525, + "mean_token_accuracy": 0.39812731742858887, + "num_tokens": 4518142883.0, + "step": 8837 + }, + { + "epoch": 2.3899405083829097, + "grad_norm": 3.140625, + "learning_rate": 0.01207808829794715, + "loss": 3.2372, + "mean_token_accuracy": 0.39824992418289185, + "num_tokens": 4518666865.0, + "step": 8838 + }, + { + "epoch": 2.3902109248242294, + "grad_norm": 3.359375, + "learning_rate": 0.012076523143653133, + "loss": 3.2121, + "mean_token_accuracy": 0.37761518359184265, + "num_tokens": 4519191026.0, + "step": 8839 + }, + { + "epoch": 2.390481341265549, + "grad_norm": 2.65625, + "learning_rate": 0.01207495795632822, + "loss": 3.0271, + "mean_token_accuracy": 0.4188137650489807, + "num_tokens": 4519715273.0, + "step": 8840 + }, + { + "epoch": 2.3907517577068687, + "grad_norm": 2.578125, + "learning_rate": 0.012073392736020436, + "loss": 3.0783, + "mean_token_accuracy": 0.4156531095504761, + "num_tokens": 4520195844.0, + "step": 8841 + }, + { + "epoch": 2.3910221741481883, + "grad_norm": 3.375, + "learning_rate": 0.012071827482777806, + "loss": 3.372, + "mean_token_accuracy": 0.38778436183929443, + "num_tokens": 4520689918.0, + "step": 8842 + }, + { + "epoch": 2.391292590589508, + "grad_norm": 3.546875, + "learning_rate": 0.012070262196648356, + "loss": 3.2761, + "mean_token_accuracy": 0.3864763379096985, + "num_tokens": 4521194113.0, + "step": 8843 + }, + { + "epoch": 2.3915630070308276, + "grad_norm": 7.03125, + "learning_rate": 0.012068696877680116, + "loss": 3.1781, + "mean_token_accuracy": 0.4110400080680847, + "num_tokens": 4521718281.0, + "step": 8844 + }, + { + "epoch": 2.3918334234721472, + "grad_norm": 1.5625, + "learning_rate": 0.01206713152592111, + "loss": 3.1636, + "mean_token_accuracy": 0.3875918388366699, + "num_tokens": 4522242541.0, + "step": 8845 + }, + { + "epoch": 2.392103839913467, + "grad_norm": 3.375, + "learning_rate": 0.012065566141419373, + "loss": 3.3103, + "mean_token_accuracy": 0.3823399841785431, + "num_tokens": 4522738501.0, + "step": 8846 + }, + { + "epoch": 2.3923742563547865, + "grad_norm": 2.859375, + "learning_rate": 0.012064000724222935, + "loss": 3.1248, + "mean_token_accuracy": 0.3927772343158722, + "num_tokens": 4523262571.0, + "step": 8847 + }, + { + "epoch": 2.392644672796106, + "grad_norm": 2.9375, + "learning_rate": 0.012062435274379824, + "loss": 3.275, + "mean_token_accuracy": 0.4028821587562561, + "num_tokens": 4523741785.0, + "step": 8848 + }, + { + "epoch": 2.392915089237426, + "grad_norm": 2.796875, + "learning_rate": 0.012060869791938073, + "loss": 3.1633, + "mean_token_accuracy": 0.4225374162197113, + "num_tokens": 4524182124.0, + "step": 8849 + }, + { + "epoch": 2.3931855056787454, + "grad_norm": 3.359375, + "learning_rate": 0.012059304276945718, + "loss": 3.1734, + "mean_token_accuracy": 0.3603057265281677, + "num_tokens": 4524706396.0, + "step": 8850 + }, + { + "epoch": 2.393455922120065, + "grad_norm": 5.90625, + "learning_rate": 0.012057738729450793, + "loss": 11.8002, + "mean_token_accuracy": 0.0, + "num_tokens": 4525144761.0, + "step": 8851 + }, + { + "epoch": 2.3937263385613847, + "grad_norm": 11.375, + "learning_rate": 0.012056173149501333, + "loss": 3.4094, + "mean_token_accuracy": 0.3789018988609314, + "num_tokens": 4525669003.0, + "step": 8852 + }, + { + "epoch": 2.3939967550027044, + "grad_norm": 2.546875, + "learning_rate": 0.012054607537145372, + "loss": 3.0489, + "mean_token_accuracy": 0.40766581892967224, + "num_tokens": 4526193152.0, + "step": 8853 + }, + { + "epoch": 2.3942671714440236, + "grad_norm": 2.421875, + "learning_rate": 0.012053041892430954, + "loss": 3.1857, + "mean_token_accuracy": 0.38638824224472046, + "num_tokens": 4526717351.0, + "step": 8854 + }, + { + "epoch": 2.3945375878853437, + "grad_norm": 3.390625, + "learning_rate": 0.012051476215406112, + "loss": 3.2293, + "mean_token_accuracy": 0.3798372149467468, + "num_tokens": 4527241549.0, + "step": 8855 + }, + { + "epoch": 2.394808004326663, + "grad_norm": 3.59375, + "learning_rate": 0.012049910506118887, + "loss": 3.0551, + "mean_token_accuracy": 0.37371930480003357, + "num_tokens": 4527765656.0, + "step": 8856 + }, + { + "epoch": 2.395078420767983, + "grad_norm": 2.90625, + "learning_rate": 0.012048344764617325, + "loss": 3.154, + "mean_token_accuracy": 0.403475821018219, + "num_tokens": 4528289758.0, + "step": 8857 + }, + { + "epoch": 2.395348837209302, + "grad_norm": 2.890625, + "learning_rate": 0.012046778990949455, + "loss": 3.1506, + "mean_token_accuracy": 0.3903481066226959, + "num_tokens": 4528768954.0, + "step": 8858 + }, + { + "epoch": 2.395619253650622, + "grad_norm": 2.578125, + "learning_rate": 0.012045213185163333, + "loss": 3.0958, + "mean_token_accuracy": 0.40705227851867676, + "num_tokens": 4529293143.0, + "step": 8859 + }, + { + "epoch": 2.3958896700919414, + "grad_norm": 3.203125, + "learning_rate": 0.012043647347306993, + "loss": 3.2387, + "mean_token_accuracy": 0.3771897554397583, + "num_tokens": 4529817397.0, + "step": 8860 + }, + { + "epoch": 2.396160086533261, + "grad_norm": 2.78125, + "learning_rate": 0.012042081477428485, + "loss": 3.3185, + "mean_token_accuracy": 0.38932883739471436, + "num_tokens": 4530284368.0, + "step": 8861 + }, + { + "epoch": 2.3964305029745807, + "grad_norm": 2.875, + "learning_rate": 0.012040515575575853, + "loss": 3.0737, + "mean_token_accuracy": 0.36496323347091675, + "num_tokens": 4530808538.0, + "step": 8862 + }, + { + "epoch": 2.3967009194159004, + "grad_norm": 2.53125, + "learning_rate": 0.012038949641797143, + "loss": 3.2729, + "mean_token_accuracy": 0.3804236054420471, + "num_tokens": 4531332802.0, + "step": 8863 + }, + { + "epoch": 2.39697133585722, + "grad_norm": 3.40625, + "learning_rate": 0.012037383676140403, + "loss": 3.16, + "mean_token_accuracy": 0.39829617738723755, + "num_tokens": 4531857080.0, + "step": 8864 + }, + { + "epoch": 2.3972417522985396, + "grad_norm": 2.78125, + "learning_rate": 0.012035817678653679, + "loss": 3.1304, + "mean_token_accuracy": 0.4127153158187866, + "num_tokens": 4532381259.0, + "step": 8865 + }, + { + "epoch": 2.3975121687398593, + "grad_norm": 3.15625, + "learning_rate": 0.012034251649385026, + "loss": 3.0719, + "mean_token_accuracy": 0.3994295001029968, + "num_tokens": 4532905493.0, + "step": 8866 + }, + { + "epoch": 2.397782585181179, + "grad_norm": 2.78125, + "learning_rate": 0.012032685588382488, + "loss": 2.9527, + "mean_token_accuracy": 0.42652058601379395, + "num_tokens": 4533429575.0, + "step": 8867 + }, + { + "epoch": 2.3980530016224986, + "grad_norm": 2.6875, + "learning_rate": 0.012031119495694121, + "loss": 3.1991, + "mean_token_accuracy": 0.409338116645813, + "num_tokens": 4533913311.0, + "step": 8868 + }, + { + "epoch": 2.398323418063818, + "grad_norm": 2.953125, + "learning_rate": 0.012029553371367975, + "loss": 3.1988, + "mean_token_accuracy": 0.41841524839401245, + "num_tokens": 4534406230.0, + "step": 8869 + }, + { + "epoch": 2.398593834505138, + "grad_norm": 2.640625, + "learning_rate": 0.012027987215452103, + "loss": 3.1236, + "mean_token_accuracy": 0.39535781741142273, + "num_tokens": 4534930436.0, + "step": 8870 + }, + { + "epoch": 2.3988642509464575, + "grad_norm": 61.5, + "learning_rate": 0.01202642102799456, + "loss": 11.379, + "mean_token_accuracy": 0.014817794784903526, + "num_tokens": 4535454617.0, + "step": 8871 + }, + { + "epoch": 2.399134667387777, + "grad_norm": 6.5, + "learning_rate": 0.012024854809043403, + "loss": 3.448, + "mean_token_accuracy": 0.34985265135765076, + "num_tokens": 4535972787.0, + "step": 8872 + }, + { + "epoch": 2.399405083829097, + "grad_norm": 2.34375, + "learning_rate": 0.012023288558646686, + "loss": 3.3834, + "mean_token_accuracy": 0.3789302408695221, + "num_tokens": 4536496977.0, + "step": 8873 + }, + { + "epoch": 2.3996755002704164, + "grad_norm": 4.21875, + "learning_rate": 0.012021722276852465, + "loss": 3.3016, + "mean_token_accuracy": 0.39595654606819153, + "num_tokens": 4536960707.0, + "step": 8874 + }, + { + "epoch": 2.399945916711736, + "grad_norm": 2.96875, + "learning_rate": 0.012020155963708801, + "loss": 3.1573, + "mean_token_accuracy": 0.3909924626350403, + "num_tokens": 4537477222.0, + "step": 8875 + }, + { + "epoch": 2.4002163331530557, + "grad_norm": 3.359375, + "learning_rate": 0.012018589619263751, + "loss": 3.1747, + "mean_token_accuracy": 0.380426287651062, + "num_tokens": 4538001450.0, + "step": 8876 + }, + { + "epoch": 2.4004867495943754, + "grad_norm": 2.703125, + "learning_rate": 0.012017023243565381, + "loss": 2.9557, + "mean_token_accuracy": 0.4106066823005676, + "num_tokens": 4538525628.0, + "step": 8877 + }, + { + "epoch": 2.400757166035695, + "grad_norm": 3.453125, + "learning_rate": 0.01201545683666174, + "loss": 3.2775, + "mean_token_accuracy": 0.38330382108688354, + "num_tokens": 4539018720.0, + "step": 8878 + }, + { + "epoch": 2.4010275824770146, + "grad_norm": 2.078125, + "learning_rate": 0.0120138903986009, + "loss": 3.0991, + "mean_token_accuracy": 0.40558385848999023, + "num_tokens": 4539488169.0, + "step": 8879 + }, + { + "epoch": 2.4012979989183343, + "grad_norm": 2.609375, + "learning_rate": 0.012012323929430925, + "loss": 3.0972, + "mean_token_accuracy": 0.4076525568962097, + "num_tokens": 4539992685.0, + "step": 8880 + }, + { + "epoch": 2.401568415359654, + "grad_norm": 2.359375, + "learning_rate": 0.012010757429199865, + "loss": 3.1166, + "mean_token_accuracy": 0.41566896438598633, + "num_tokens": 4540490689.0, + "step": 8881 + }, + { + "epoch": 2.4018388318009736, + "grad_norm": 2.328125, + "learning_rate": 0.012009190897955801, + "loss": 3.1113, + "mean_token_accuracy": 0.41560620069503784, + "num_tokens": 4540961092.0, + "step": 8882 + }, + { + "epoch": 2.402109248242293, + "grad_norm": 2.671875, + "learning_rate": 0.012007624335746791, + "loss": 3.1281, + "mean_token_accuracy": 0.3862532377243042, + "num_tokens": 4541485282.0, + "step": 8883 + }, + { + "epoch": 2.402379664683613, + "grad_norm": 3.578125, + "learning_rate": 0.012006057742620906, + "loss": 3.2065, + "mean_token_accuracy": 0.3849722445011139, + "num_tokens": 4542009557.0, + "step": 8884 + }, + { + "epoch": 2.4026500811249325, + "grad_norm": 3.296875, + "learning_rate": 0.01200449111862621, + "loss": 2.9606, + "mean_token_accuracy": 0.4078431725502014, + "num_tokens": 4542468946.0, + "step": 8885 + }, + { + "epoch": 2.402920497566252, + "grad_norm": 21.5, + "learning_rate": 0.012002924463810771, + "loss": 3.1737, + "mean_token_accuracy": 0.3979213535785675, + "num_tokens": 4542937566.0, + "step": 8886 + }, + { + "epoch": 2.403190914007572, + "grad_norm": 4.8125, + "learning_rate": 0.01200135777822266, + "loss": 2.9642, + "mean_token_accuracy": 0.40090733766555786, + "num_tokens": 4543461821.0, + "step": 8887 + }, + { + "epoch": 2.4034613304488914, + "grad_norm": 2.0625, + "learning_rate": 0.01199979106190995, + "loss": 3.1389, + "mean_token_accuracy": 0.3831251263618469, + "num_tokens": 4543947344.0, + "step": 8888 + }, + { + "epoch": 2.403731746890211, + "grad_norm": 4.09375, + "learning_rate": 0.011998224314920708, + "loss": 3.2023, + "mean_token_accuracy": 0.4109782874584198, + "num_tokens": 4544471557.0, + "step": 8889 + }, + { + "epoch": 2.4040021633315307, + "grad_norm": 2.078125, + "learning_rate": 0.011996657537303007, + "loss": 3.3755, + "mean_token_accuracy": 0.38052648305892944, + "num_tokens": 4544977554.0, + "step": 8890 + }, + { + "epoch": 2.4042725797728504, + "grad_norm": 126.5, + "learning_rate": 0.011995090729104922, + "loss": 15.0862, + "mean_token_accuracy": 8.772955879976507e-06, + "num_tokens": 4545440741.0, + "step": 8891 + }, + { + "epoch": 2.40454299621417, + "grad_norm": 6.15625, + "learning_rate": 0.01199352389037453, + "loss": 3.2576, + "mean_token_accuracy": 0.34501558542251587, + "num_tokens": 4545965023.0, + "step": 8892 + }, + { + "epoch": 2.4048134126554896, + "grad_norm": 3.0, + "learning_rate": 0.011991957021159903, + "loss": 3.4169, + "mean_token_accuracy": 0.3941441774368286, + "num_tokens": 4546405703.0, + "step": 8893 + }, + { + "epoch": 2.4050838290968093, + "grad_norm": 2.40625, + "learning_rate": 0.011990390121509114, + "loss": 3.3439, + "mean_token_accuracy": 0.36787107586860657, + "num_tokens": 4546929942.0, + "step": 8894 + }, + { + "epoch": 2.4053542455381285, + "grad_norm": 3.265625, + "learning_rate": 0.011988823191470247, + "loss": 3.175, + "mean_token_accuracy": 0.4239884614944458, + "num_tokens": 4547389551.0, + "step": 8895 + }, + { + "epoch": 2.4056246619794486, + "grad_norm": 2.953125, + "learning_rate": 0.011987256231091372, + "loss": 3.0407, + "mean_token_accuracy": 0.4138660430908203, + "num_tokens": 4547872498.0, + "step": 8896 + }, + { + "epoch": 2.4058950784207678, + "grad_norm": 3.109375, + "learning_rate": 0.011985689240420578, + "loss": 3.254, + "mean_token_accuracy": 0.36476320028305054, + "num_tokens": 4548396689.0, + "step": 8897 + }, + { + "epoch": 2.406165494862088, + "grad_norm": 3.078125, + "learning_rate": 0.011984122219505936, + "loss": 3.1591, + "mean_token_accuracy": 0.41731899976730347, + "num_tokens": 4548887176.0, + "step": 8898 + }, + { + "epoch": 2.406435911303407, + "grad_norm": 3.453125, + "learning_rate": 0.01198255516839553, + "loss": 3.3486, + "mean_token_accuracy": 0.37830570340156555, + "num_tokens": 4549411446.0, + "step": 8899 + }, + { + "epoch": 2.4067063277447267, + "grad_norm": 2.375, + "learning_rate": 0.011980988087137443, + "loss": 3.0325, + "mean_token_accuracy": 0.3805539608001709, + "num_tokens": 4549935730.0, + "step": 8900 + }, + { + "epoch": 2.4069767441860463, + "grad_norm": 3.25, + "learning_rate": 0.01197942097577976, + "loss": 3.1599, + "mean_token_accuracy": 0.39521926641464233, + "num_tokens": 4550460006.0, + "step": 8901 + }, + { + "epoch": 2.407247160627366, + "grad_norm": 3.34375, + "learning_rate": 0.011977853834370555, + "loss": 3.3347, + "mean_token_accuracy": 0.39384666085243225, + "num_tokens": 4550984140.0, + "step": 8902 + }, + { + "epoch": 2.4075175770686856, + "grad_norm": 2.953125, + "learning_rate": 0.01197628666295792, + "loss": 3.0228, + "mean_token_accuracy": 0.40203428268432617, + "num_tokens": 4551508266.0, + "step": 8903 + }, + { + "epoch": 2.4077879935100053, + "grad_norm": 2.546875, + "learning_rate": 0.011974719461589942, + "loss": 3.1953, + "mean_token_accuracy": 0.4127214550971985, + "num_tokens": 4552032486.0, + "step": 8904 + }, + { + "epoch": 2.408058409951325, + "grad_norm": 2.53125, + "learning_rate": 0.0119731522303147, + "loss": 3.0054, + "mean_token_accuracy": 0.39845699071884155, + "num_tokens": 4552556684.0, + "step": 8905 + }, + { + "epoch": 2.4083288263926446, + "grad_norm": 2.28125, + "learning_rate": 0.011971584969180288, + "loss": 3.0887, + "mean_token_accuracy": 0.40627211332321167, + "num_tokens": 4553080956.0, + "step": 8906 + }, + { + "epoch": 2.408599242833964, + "grad_norm": 2.859375, + "learning_rate": 0.011970017678234798, + "loss": 3.0429, + "mean_token_accuracy": 0.4171024560928345, + "num_tokens": 4553558954.0, + "step": 8907 + }, + { + "epoch": 2.408869659275284, + "grad_norm": 10.0, + "learning_rate": 0.011968450357526306, + "loss": 3.1126, + "mean_token_accuracy": 0.42393797636032104, + "num_tokens": 4554019124.0, + "step": 8908 + }, + { + "epoch": 2.4091400757166035, + "grad_norm": 2.421875, + "learning_rate": 0.011966883007102911, + "loss": 3.0533, + "mean_token_accuracy": 0.3961780071258545, + "num_tokens": 4554543275.0, + "step": 8909 + }, + { + "epoch": 2.409410492157923, + "grad_norm": 2.203125, + "learning_rate": 0.011965315627012701, + "loss": 3.1475, + "mean_token_accuracy": 0.38278794288635254, + "num_tokens": 4555067558.0, + "step": 8910 + }, + { + "epoch": 2.4096809085992428, + "grad_norm": 92.5, + "learning_rate": 0.011963748217303768, + "loss": 14.0626, + "mean_token_accuracy": 0.023078786209225655, + "num_tokens": 4555591795.0, + "step": 8911 + }, + { + "epoch": 2.4099513250405624, + "grad_norm": 6.65625, + "learning_rate": 0.011962180778024207, + "loss": 3.5311, + "mean_token_accuracy": 0.3353724479675293, + "num_tokens": 4556116041.0, + "step": 8912 + }, + { + "epoch": 2.410221741481882, + "grad_norm": 2.375, + "learning_rate": 0.01196061330922211, + "loss": 3.2369, + "mean_token_accuracy": 0.3876342177391052, + "num_tokens": 4556640151.0, + "step": 8913 + }, + { + "epoch": 2.4104921579232017, + "grad_norm": 2.21875, + "learning_rate": 0.011959045810945573, + "loss": 3.1229, + "mean_token_accuracy": 0.39032265543937683, + "num_tokens": 4557164219.0, + "step": 8914 + }, + { + "epoch": 2.4107625743645213, + "grad_norm": 3.421875, + "learning_rate": 0.011957478283242689, + "loss": 3.2654, + "mean_token_accuracy": 0.39903679490089417, + "num_tokens": 4557688353.0, + "step": 8915 + }, + { + "epoch": 2.411032990805841, + "grad_norm": 2.71875, + "learning_rate": 0.011955910726161555, + "loss": 3.0818, + "mean_token_accuracy": 0.3782639503479004, + "num_tokens": 4558212570.0, + "step": 8916 + }, + { + "epoch": 2.4113034072471606, + "grad_norm": 3.1875, + "learning_rate": 0.011954343139750268, + "loss": 3.1085, + "mean_token_accuracy": 0.4296615719795227, + "num_tokens": 4558667403.0, + "step": 8917 + }, + { + "epoch": 2.4115738236884803, + "grad_norm": 3.0, + "learning_rate": 0.011952775524056927, + "loss": 3.206, + "mean_token_accuracy": 0.3736034035682678, + "num_tokens": 4559191613.0, + "step": 8918 + }, + { + "epoch": 2.4118442401298, + "grad_norm": 11.0625, + "learning_rate": 0.01195120787912963, + "loss": 3.3378, + "mean_token_accuracy": 0.3867376744747162, + "num_tokens": 4559657792.0, + "step": 8919 + }, + { + "epoch": 2.4121146565711196, + "grad_norm": 2.359375, + "learning_rate": 0.011949640205016486, + "loss": 3.2375, + "mean_token_accuracy": 0.3963721990585327, + "num_tokens": 4560181991.0, + "step": 8920 + }, + { + "epoch": 2.412385073012439, + "grad_norm": 3.125, + "learning_rate": 0.01194807250176558, + "loss": 3.0254, + "mean_token_accuracy": 0.4287392497062683, + "num_tokens": 4560641834.0, + "step": 8921 + }, + { + "epoch": 2.412655489453759, + "grad_norm": 3.296875, + "learning_rate": 0.011946504769425024, + "loss": 3.3795, + "mean_token_accuracy": 0.36947667598724365, + "num_tokens": 4561166072.0, + "step": 8922 + }, + { + "epoch": 2.4129259058950785, + "grad_norm": 2.984375, + "learning_rate": 0.011944937008042918, + "loss": 3.3551, + "mean_token_accuracy": 0.377840518951416, + "num_tokens": 4561690206.0, + "step": 8923 + }, + { + "epoch": 2.413196322336398, + "grad_norm": 2.734375, + "learning_rate": 0.011943369217667364, + "loss": 3.1937, + "mean_token_accuracy": 0.40162956714630127, + "num_tokens": 4562199553.0, + "step": 8924 + }, + { + "epoch": 2.413466738777718, + "grad_norm": 3.09375, + "learning_rate": 0.011941801398346469, + "loss": 3.1707, + "mean_token_accuracy": 0.4007144868373871, + "num_tokens": 4562723729.0, + "step": 8925 + }, + { + "epoch": 2.4137371552190374, + "grad_norm": 2.640625, + "learning_rate": 0.011940233550128338, + "loss": 3.172, + "mean_token_accuracy": 0.4064624309539795, + "num_tokens": 4563221577.0, + "step": 8926 + }, + { + "epoch": 2.414007571660357, + "grad_norm": 2.90625, + "learning_rate": 0.011938665673061075, + "loss": 2.9063, + "mean_token_accuracy": 0.4317680597305298, + "num_tokens": 4563684661.0, + "step": 8927 + }, + { + "epoch": 2.4142779881016767, + "grad_norm": 2.78125, + "learning_rate": 0.011937097767192791, + "loss": 2.9168, + "mean_token_accuracy": 0.4154502749443054, + "num_tokens": 4564182368.0, + "step": 8928 + }, + { + "epoch": 2.4145484045429964, + "grad_norm": 2.703125, + "learning_rate": 0.011935529832571592, + "loss": 3.137, + "mean_token_accuracy": 0.37930309772491455, + "num_tokens": 4564706593.0, + "step": 8929 + }, + { + "epoch": 2.414818820984316, + "grad_norm": 3.6875, + "learning_rate": 0.011933961869245585, + "loss": 3.3413, + "mean_token_accuracy": 0.39372268319129944, + "num_tokens": 4565182891.0, + "step": 8930 + }, + { + "epoch": 2.4150892374256356, + "grad_norm": 91.0, + "learning_rate": 0.01193239387726288, + "loss": 18.71, + "mean_token_accuracy": 0.0, + "num_tokens": 4565706964.0, + "step": 8931 + }, + { + "epoch": 2.4153596538669553, + "grad_norm": 5.5625, + "learning_rate": 0.01193082585667159, + "loss": 3.4518, + "mean_token_accuracy": 0.3526393175125122, + "num_tokens": 4566231084.0, + "step": 8932 + }, + { + "epoch": 2.415630070308275, + "grad_norm": 2.078125, + "learning_rate": 0.01192925780751983, + "loss": 3.2625, + "mean_token_accuracy": 0.3807923197746277, + "num_tokens": 4566755254.0, + "step": 8933 + }, + { + "epoch": 2.4159004867495946, + "grad_norm": 3.015625, + "learning_rate": 0.0119276897298557, + "loss": 3.1695, + "mean_token_accuracy": 0.4134882092475891, + "num_tokens": 4567221447.0, + "step": 8934 + }, + { + "epoch": 2.416170903190914, + "grad_norm": 2.953125, + "learning_rate": 0.011926121623727325, + "loss": 3.205, + "mean_token_accuracy": 0.39370888471603394, + "num_tokens": 4567707561.0, + "step": 8935 + }, + { + "epoch": 2.4164413196322334, + "grad_norm": 3.34375, + "learning_rate": 0.011924553489182815, + "loss": 3.2142, + "mean_token_accuracy": 0.4079784154891968, + "num_tokens": 4568231788.0, + "step": 8936 + }, + { + "epoch": 2.4167117360735535, + "grad_norm": 3.53125, + "learning_rate": 0.011922985326270286, + "loss": 3.3255, + "mean_token_accuracy": 0.38047197461128235, + "num_tokens": 4568756037.0, + "step": 8937 + }, + { + "epoch": 2.4169821525148727, + "grad_norm": 2.765625, + "learning_rate": 0.011921417135037849, + "loss": 3.047, + "mean_token_accuracy": 0.41899383068084717, + "num_tokens": 4569229709.0, + "step": 8938 + }, + { + "epoch": 2.417252568956193, + "grad_norm": 2.390625, + "learning_rate": 0.011919848915533627, + "loss": 3.1451, + "mean_token_accuracy": 0.4091995358467102, + "num_tokens": 4569753957.0, + "step": 8939 + }, + { + "epoch": 2.417522985397512, + "grad_norm": 3.078125, + "learning_rate": 0.011918280667805734, + "loss": 3.0487, + "mean_token_accuracy": 0.4235409200191498, + "num_tokens": 4570218943.0, + "step": 8940 + }, + { + "epoch": 2.4177934018388316, + "grad_norm": 3.015625, + "learning_rate": 0.011916712391902291, + "loss": 3.1777, + "mean_token_accuracy": 0.3962802588939667, + "num_tokens": 4570743120.0, + "step": 8941 + }, + { + "epoch": 2.4180638182801513, + "grad_norm": 2.71875, + "learning_rate": 0.011915144087871414, + "loss": 3.0146, + "mean_token_accuracy": 0.4159214496612549, + "num_tokens": 4571267309.0, + "step": 8942 + }, + { + "epoch": 2.418334234721471, + "grad_norm": 3.234375, + "learning_rate": 0.01191357575576123, + "loss": 3.095, + "mean_token_accuracy": 0.3810434937477112, + "num_tokens": 4571791584.0, + "step": 8943 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 2.609375, + "learning_rate": 0.01191200739561985, + "loss": 3.144, + "mean_token_accuracy": 0.3981197476387024, + "num_tokens": 4572315831.0, + "step": 8944 + }, + { + "epoch": 2.41887506760411, + "grad_norm": 2.84375, + "learning_rate": 0.011910439007495404, + "loss": 2.9433, + "mean_token_accuracy": 0.4512373208999634, + "num_tokens": 4572775166.0, + "step": 8945 + }, + { + "epoch": 2.41914548404543, + "grad_norm": 2.375, + "learning_rate": 0.01190887059143601, + "loss": 3.2077, + "mean_token_accuracy": 0.4096040725708008, + "num_tokens": 4573299440.0, + "step": 8946 + }, + { + "epoch": 2.4194159004867495, + "grad_norm": 2.9375, + "learning_rate": 0.011907302147489793, + "loss": 3.199, + "mean_token_accuracy": 0.39408165216445923, + "num_tokens": 4573802225.0, + "step": 8947 + }, + { + "epoch": 2.419686316928069, + "grad_norm": 2.6875, + "learning_rate": 0.01190573367570488, + "loss": 3.1509, + "mean_token_accuracy": 0.40698403120040894, + "num_tokens": 4574326338.0, + "step": 8948 + }, + { + "epoch": 2.4199567333693888, + "grad_norm": 3.859375, + "learning_rate": 0.011904165176129394, + "loss": 3.2024, + "mean_token_accuracy": 0.383005291223526, + "num_tokens": 4574802281.0, + "step": 8949 + }, + { + "epoch": 2.4202271498107084, + "grad_norm": 2.671875, + "learning_rate": 0.01190259664881146, + "loss": 3.2763, + "mean_token_accuracy": 0.36154207587242126, + "num_tokens": 4575326329.0, + "step": 8950 + }, + { + "epoch": 2.420497566252028, + "grad_norm": 225.0, + "learning_rate": 0.011901028093799209, + "loss": 18.4232, + "mean_token_accuracy": 0.015434720553457737, + "num_tokens": 4575801971.0, + "step": 8951 + }, + { + "epoch": 2.4207679826933477, + "grad_norm": 7.90625, + "learning_rate": 0.011899459511140763, + "loss": 3.7398, + "mean_token_accuracy": 0.32072752714157104, + "num_tokens": 4576326242.0, + "step": 8952 + }, + { + "epoch": 2.4210383991346673, + "grad_norm": 2.46875, + "learning_rate": 0.011897890900884254, + "loss": 3.2927, + "mean_token_accuracy": 0.3749314248561859, + "num_tokens": 4576850419.0, + "step": 8953 + }, + { + "epoch": 2.421308815575987, + "grad_norm": 2.015625, + "learning_rate": 0.011896322263077814, + "loss": 3.0507, + "mean_token_accuracy": 0.4082416594028473, + "num_tokens": 4577374697.0, + "step": 8954 + }, + { + "epoch": 2.4215792320173066, + "grad_norm": 3.0625, + "learning_rate": 0.011894753597769573, + "loss": 3.3866, + "mean_token_accuracy": 0.39904528856277466, + "num_tokens": 4577842472.0, + "step": 8955 + }, + { + "epoch": 2.4218496484586263, + "grad_norm": 3.015625, + "learning_rate": 0.011893184905007657, + "loss": 3.218, + "mean_token_accuracy": 0.39753270149230957, + "num_tokens": 4578366702.0, + "step": 8956 + }, + { + "epoch": 2.422120064899946, + "grad_norm": 3.015625, + "learning_rate": 0.011891616184840201, + "loss": 3.0364, + "mean_token_accuracy": 0.4006202816963196, + "num_tokens": 4578890987.0, + "step": 8957 + }, + { + "epoch": 2.4223904813412656, + "grad_norm": 3.734375, + "learning_rate": 0.01189004743731534, + "loss": 3.4077, + "mean_token_accuracy": 0.3679499328136444, + "num_tokens": 4579415213.0, + "step": 8958 + }, + { + "epoch": 2.422660897782585, + "grad_norm": 3.640625, + "learning_rate": 0.011888478662481203, + "loss": 3.2457, + "mean_token_accuracy": 0.4046422243118286, + "num_tokens": 4579915215.0, + "step": 8959 + }, + { + "epoch": 2.422931314223905, + "grad_norm": 2.5625, + "learning_rate": 0.01188690986038593, + "loss": 3.1401, + "mean_token_accuracy": 0.37980353832244873, + "num_tokens": 4580439462.0, + "step": 8960 + }, + { + "epoch": 2.4232017306652245, + "grad_norm": 3.328125, + "learning_rate": 0.011885341031077657, + "loss": 3.316, + "mean_token_accuracy": 0.38550931215286255, + "num_tokens": 4580963743.0, + "step": 8961 + }, + { + "epoch": 2.423472147106544, + "grad_norm": 2.78125, + "learning_rate": 0.011883772174604514, + "loss": 3.2636, + "mean_token_accuracy": 0.40823450684547424, + "num_tokens": 4581426951.0, + "step": 8962 + }, + { + "epoch": 2.4237425635478638, + "grad_norm": 3.203125, + "learning_rate": 0.011882203291014646, + "loss": 3.1561, + "mean_token_accuracy": 0.39695870876312256, + "num_tokens": 4581951182.0, + "step": 8963 + }, + { + "epoch": 2.4240129799891834, + "grad_norm": 3.15625, + "learning_rate": 0.011880634380356182, + "loss": 3.2813, + "mean_token_accuracy": 0.3868764340877533, + "num_tokens": 4582475353.0, + "step": 8964 + }, + { + "epoch": 2.424283396430503, + "grad_norm": 2.8125, + "learning_rate": 0.011879065442677266, + "loss": 3.0333, + "mean_token_accuracy": 0.39699026942253113, + "num_tokens": 4582999639.0, + "step": 8965 + }, + { + "epoch": 2.4245538128718227, + "grad_norm": 3.140625, + "learning_rate": 0.011877496478026037, + "loss": 2.9623, + "mean_token_accuracy": 0.3846190273761749, + "num_tokens": 4583523852.0, + "step": 8966 + }, + { + "epoch": 2.4248242293131423, + "grad_norm": 3.03125, + "learning_rate": 0.011875927486450638, + "loss": 3.0483, + "mean_token_accuracy": 0.39725667238235474, + "num_tokens": 4583999019.0, + "step": 8967 + }, + { + "epoch": 2.425094645754462, + "grad_norm": 2.75, + "learning_rate": 0.011874358467999205, + "loss": 2.9044, + "mean_token_accuracy": 0.4144865572452545, + "num_tokens": 4584523218.0, + "step": 8968 + }, + { + "epoch": 2.4253650621957816, + "grad_norm": 2.59375, + "learning_rate": 0.011872789422719887, + "loss": 3.1094, + "mean_token_accuracy": 0.4054189920425415, + "num_tokens": 4585047489.0, + "step": 8969 + }, + { + "epoch": 2.4256354786371013, + "grad_norm": 2.9375, + "learning_rate": 0.011871220350660818, + "loss": 3.2619, + "mean_token_accuracy": 0.4001712203025818, + "num_tokens": 4585514849.0, + "step": 8970 + }, + { + "epoch": 2.425905895078421, + "grad_norm": 112.0, + "learning_rate": 0.01186965125187015, + "loss": 21.2567, + "mean_token_accuracy": 0.0015944850165396929, + "num_tokens": 4586039070.0, + "step": 8971 + }, + { + "epoch": 2.4261763115197406, + "grad_norm": 5.03125, + "learning_rate": 0.011868082126396022, + "loss": 3.1495, + "mean_token_accuracy": 0.3871898651123047, + "num_tokens": 4586515470.0, + "step": 8972 + }, + { + "epoch": 2.42644672796106, + "grad_norm": 2.59375, + "learning_rate": 0.011866512974286583, + "loss": 3.0802, + "mean_token_accuracy": 0.37779563665390015, + "num_tokens": 4587039531.0, + "step": 8973 + }, + { + "epoch": 2.42671714440238, + "grad_norm": 2.28125, + "learning_rate": 0.011864943795589973, + "loss": 3.2621, + "mean_token_accuracy": 0.3946037292480469, + "num_tokens": 4587543334.0, + "step": 8974 + }, + { + "epoch": 2.4269875608436995, + "grad_norm": 3.046875, + "learning_rate": 0.01186337459035435, + "loss": 3.2045, + "mean_token_accuracy": 0.39482882618904114, + "num_tokens": 4588036930.0, + "step": 8975 + }, + { + "epoch": 2.427257977285019, + "grad_norm": 2.59375, + "learning_rate": 0.011861805358627854, + "loss": 3.2177, + "mean_token_accuracy": 0.4018220901489258, + "num_tokens": 4588561088.0, + "step": 8976 + }, + { + "epoch": 2.4275283937263383, + "grad_norm": 3.015625, + "learning_rate": 0.011860236100458632, + "loss": 3.119, + "mean_token_accuracy": 0.4008135199546814, + "num_tokens": 4589085325.0, + "step": 8977 + }, + { + "epoch": 2.4277988101676584, + "grad_norm": 2.546875, + "learning_rate": 0.011858666815894842, + "loss": 3.0572, + "mean_token_accuracy": 0.3994871973991394, + "num_tokens": 4589607524.0, + "step": 8978 + }, + { + "epoch": 2.4280692266089776, + "grad_norm": 2.59375, + "learning_rate": 0.011857097504984626, + "loss": 3.0753, + "mean_token_accuracy": 0.4045708179473877, + "num_tokens": 4590099786.0, + "step": 8979 + }, + { + "epoch": 2.4283396430502977, + "grad_norm": 2.421875, + "learning_rate": 0.011855528167776137, + "loss": 3.1402, + "mean_token_accuracy": 0.4231240451335907, + "num_tokens": 4590581276.0, + "step": 8980 + }, + { + "epoch": 2.428610059491617, + "grad_norm": 3.09375, + "learning_rate": 0.011853958804317527, + "loss": 3.1273, + "mean_token_accuracy": 0.414980411529541, + "num_tokens": 4591096719.0, + "step": 8981 + }, + { + "epoch": 2.4288804759329365, + "grad_norm": 2.59375, + "learning_rate": 0.011852389414656953, + "loss": 3.0383, + "mean_token_accuracy": 0.38693106174468994, + "num_tokens": 4591577049.0, + "step": 8982 + }, + { + "epoch": 2.429150892374256, + "grad_norm": 2.859375, + "learning_rate": 0.011850819998842563, + "loss": 2.9478, + "mean_token_accuracy": 0.4049954414367676, + "num_tokens": 4592101305.0, + "step": 8983 + }, + { + "epoch": 2.429421308815576, + "grad_norm": 2.359375, + "learning_rate": 0.011849250556922514, + "loss": 3.2214, + "mean_token_accuracy": 0.3988329768180847, + "num_tokens": 4592589510.0, + "step": 8984 + }, + { + "epoch": 2.4296917252568955, + "grad_norm": 3.28125, + "learning_rate": 0.011847681088944961, + "loss": 3.1429, + "mean_token_accuracy": 0.39357972145080566, + "num_tokens": 4593077875.0, + "step": 8985 + }, + { + "epoch": 2.429962141698215, + "grad_norm": 2.6875, + "learning_rate": 0.011846111594958058, + "loss": 3.0795, + "mean_token_accuracy": 0.4163764715194702, + "num_tokens": 4593562200.0, + "step": 8986 + }, + { + "epoch": 2.4302325581395348, + "grad_norm": 2.625, + "learning_rate": 0.011844542075009963, + "loss": 3.0593, + "mean_token_accuracy": 0.40586057305336, + "num_tokens": 4594049983.0, + "step": 8987 + }, + { + "epoch": 2.4305029745808544, + "grad_norm": 2.734375, + "learning_rate": 0.011842972529148834, + "loss": 3.0802, + "mean_token_accuracy": 0.39593231678009033, + "num_tokens": 4594574246.0, + "step": 8988 + }, + { + "epoch": 2.430773391022174, + "grad_norm": 3.03125, + "learning_rate": 0.01184140295742283, + "loss": 3.2325, + "mean_token_accuracy": 0.3984221816062927, + "num_tokens": 4595070975.0, + "step": 8989 + }, + { + "epoch": 2.4310438074634937, + "grad_norm": 2.84375, + "learning_rate": 0.011839833359880106, + "loss": 3.119, + "mean_token_accuracy": 0.39629238843917847, + "num_tokens": 4595595119.0, + "step": 8990 + }, + { + "epoch": 2.4313142239048133, + "grad_norm": 61.0, + "learning_rate": 0.011838263736568823, + "loss": 12.0207, + "mean_token_accuracy": 0.017916589975357056, + "num_tokens": 4596119304.0, + "step": 8991 + }, + { + "epoch": 2.431584640346133, + "grad_norm": 6.5625, + "learning_rate": 0.011836694087537153, + "loss": 3.3985, + "mean_token_accuracy": 0.35462522506713867, + "num_tokens": 4596643527.0, + "step": 8992 + }, + { + "epoch": 2.4318550567874526, + "grad_norm": 1.75, + "learning_rate": 0.011835124412833239, + "loss": 3.0592, + "mean_token_accuracy": 0.4003615379333496, + "num_tokens": 4597167773.0, + "step": 8993 + }, + { + "epoch": 2.4321254732287723, + "grad_norm": 2.609375, + "learning_rate": 0.011833554712505254, + "loss": 3.3203, + "mean_token_accuracy": 0.4010363519191742, + "num_tokens": 4597665442.0, + "step": 8994 + }, + { + "epoch": 2.432395889670092, + "grad_norm": 3.046875, + "learning_rate": 0.011831984986601358, + "loss": 3.0717, + "mean_token_accuracy": 0.40806034207344055, + "num_tokens": 4598189662.0, + "step": 8995 + }, + { + "epoch": 2.4326663061114115, + "grad_norm": 3.09375, + "learning_rate": 0.011830415235169717, + "loss": 3.1101, + "mean_token_accuracy": 0.43114036321640015, + "num_tokens": 4598650674.0, + "step": 8996 + }, + { + "epoch": 2.432936722552731, + "grad_norm": 25.75, + "learning_rate": 0.011828845458258496, + "loss": 2.9168, + "mean_token_accuracy": 0.4210253953933716, + "num_tokens": 4599132239.0, + "step": 8997 + }, + { + "epoch": 2.433207138994051, + "grad_norm": 5.5, + "learning_rate": 0.01182727565591586, + "loss": 3.347, + "mean_token_accuracy": 0.3522660732269287, + "num_tokens": 4599656510.0, + "step": 8998 + }, + { + "epoch": 2.4334775554353705, + "grad_norm": 2.03125, + "learning_rate": 0.01182570582818997, + "loss": 3.1611, + "mean_token_accuracy": 0.39207643270492554, + "num_tokens": 4600180737.0, + "step": 8999 + }, + { + "epoch": 2.43374797187669, + "grad_norm": 2.859375, + "learning_rate": 0.011824135975129, + "loss": 3.0819, + "mean_token_accuracy": 0.3976697325706482, + "num_tokens": 4600705017.0, + "step": 9000 + }, + { + "epoch": 2.4340183883180098, + "grad_norm": 2.875, + "learning_rate": 0.011822566096781114, + "loss": 3.1887, + "mean_token_accuracy": 0.3929237127304077, + "num_tokens": 4601229168.0, + "step": 9001 + }, + { + "epoch": 2.4342888047593294, + "grad_norm": 2.828125, + "learning_rate": 0.01182099619319448, + "loss": 3.0973, + "mean_token_accuracy": 0.3844074010848999, + "num_tokens": 4601753363.0, + "step": 9002 + }, + { + "epoch": 2.434559221200649, + "grad_norm": 2.9375, + "learning_rate": 0.011819426264417269, + "loss": 3.1713, + "mean_token_accuracy": 0.4036952555179596, + "num_tokens": 4602277541.0, + "step": 9003 + }, + { + "epoch": 2.4348296376419687, + "grad_norm": 3.8125, + "learning_rate": 0.01181785631049765, + "loss": 3.1374, + "mean_token_accuracy": 0.3884146809577942, + "num_tokens": 4602801740.0, + "step": 9004 + }, + { + "epoch": 2.4351000540832883, + "grad_norm": 3.484375, + "learning_rate": 0.011816286331483796, + "loss": 3.1923, + "mean_token_accuracy": 0.41949644684791565, + "num_tokens": 4603325988.0, + "step": 9005 + }, + { + "epoch": 2.435370470524608, + "grad_norm": 3.28125, + "learning_rate": 0.011814716327423876, + "loss": 3.2099, + "mean_token_accuracy": 0.4034154415130615, + "num_tokens": 4603850167.0, + "step": 9006 + }, + { + "epoch": 2.4356408869659276, + "grad_norm": 3.015625, + "learning_rate": 0.011813146298366063, + "loss": 3.2992, + "mean_token_accuracy": 0.4012463092803955, + "num_tokens": 4604329217.0, + "step": 9007 + }, + { + "epoch": 2.4359113034072473, + "grad_norm": 2.875, + "learning_rate": 0.011811576244358532, + "loss": 3.1316, + "mean_token_accuracy": 0.39856183528900146, + "num_tokens": 4604853412.0, + "step": 9008 + }, + { + "epoch": 2.436181719848567, + "grad_norm": 3.203125, + "learning_rate": 0.011810006165449449, + "loss": 3.223, + "mean_token_accuracy": 0.4011475443840027, + "num_tokens": 4605377668.0, + "step": 9009 + }, + { + "epoch": 2.4364521362898865, + "grad_norm": 2.65625, + "learning_rate": 0.011808436061687, + "loss": 3.079, + "mean_token_accuracy": 0.4045482575893402, + "num_tokens": 4605838842.0, + "step": 9010 + }, + { + "epoch": 2.436722552731206, + "grad_norm": 60.5, + "learning_rate": 0.011806865933119352, + "loss": 14.4328, + "mean_token_accuracy": 0.0, + "num_tokens": 4606362962.0, + "step": 9011 + }, + { + "epoch": 2.436992969172526, + "grad_norm": 5.4375, + "learning_rate": 0.01180529577979469, + "loss": 3.3352, + "mean_token_accuracy": 0.3787618577480316, + "num_tokens": 4606885611.0, + "step": 9012 + }, + { + "epoch": 2.4372633856138455, + "grad_norm": 2.0625, + "learning_rate": 0.011803725601761182, + "loss": 3.169, + "mean_token_accuracy": 0.408771812915802, + "num_tokens": 4607354140.0, + "step": 9013 + }, + { + "epoch": 2.437533802055165, + "grad_norm": 2.21875, + "learning_rate": 0.011802155399067007, + "loss": 3.1231, + "mean_token_accuracy": 0.3909578025341034, + "num_tokens": 4607878303.0, + "step": 9014 + }, + { + "epoch": 2.4378042184964848, + "grad_norm": 2.65625, + "learning_rate": 0.011800585171760348, + "loss": 3.1185, + "mean_token_accuracy": 0.3978530764579773, + "num_tokens": 4608402491.0, + "step": 9015 + }, + { + "epoch": 2.4380746349378044, + "grad_norm": 2.640625, + "learning_rate": 0.01179901491988938, + "loss": 2.9969, + "mean_token_accuracy": 0.4055306315422058, + "num_tokens": 4608926757.0, + "step": 9016 + }, + { + "epoch": 2.438345051379124, + "grad_norm": 2.703125, + "learning_rate": 0.011797444643502283, + "loss": 2.9972, + "mean_token_accuracy": 0.4226030707359314, + "num_tokens": 4609390668.0, + "step": 9017 + }, + { + "epoch": 2.4386154678204432, + "grad_norm": 2.5, + "learning_rate": 0.011795874342647242, + "loss": 3.1172, + "mean_token_accuracy": 0.3952178657054901, + "num_tokens": 4609862369.0, + "step": 9018 + }, + { + "epoch": 2.4388858842617633, + "grad_norm": 2.6875, + "learning_rate": 0.011794304017372435, + "loss": 2.7977, + "mean_token_accuracy": 0.4351855218410492, + "num_tokens": 4610346774.0, + "step": 9019 + }, + { + "epoch": 2.4391563007030825, + "grad_norm": 2.953125, + "learning_rate": 0.011792733667726046, + "loss": 3.0468, + "mean_token_accuracy": 0.40124931931495667, + "num_tokens": 4610870991.0, + "step": 9020 + }, + { + "epoch": 2.4394267171444026, + "grad_norm": 3.9375, + "learning_rate": 0.011791163293756256, + "loss": 3.3696, + "mean_token_accuracy": 0.3997042775154114, + "num_tokens": 4611395207.0, + "step": 9021 + }, + { + "epoch": 2.439697133585722, + "grad_norm": 2.890625, + "learning_rate": 0.011789592895511248, + "loss": 2.9538, + "mean_token_accuracy": 0.4024946093559265, + "num_tokens": 4611919195.0, + "step": 9022 + }, + { + "epoch": 2.4399675500270415, + "grad_norm": 3.203125, + "learning_rate": 0.01178802247303921, + "loss": 3.2344, + "mean_token_accuracy": 0.3812624216079712, + "num_tokens": 4612425930.0, + "step": 9023 + }, + { + "epoch": 2.440237966468361, + "grad_norm": 2.546875, + "learning_rate": 0.011786452026388321, + "loss": 3.0663, + "mean_token_accuracy": 0.4133418798446655, + "num_tokens": 4612894527.0, + "step": 9024 + }, + { + "epoch": 2.4405083829096808, + "grad_norm": 2.5, + "learning_rate": 0.011784881555606776, + "loss": 3.0639, + "mean_token_accuracy": 0.4225291907787323, + "num_tokens": 4613418688.0, + "step": 9025 + }, + { + "epoch": 2.4407787993510004, + "grad_norm": 2.75, + "learning_rate": 0.011783311060742754, + "loss": 3.1896, + "mean_token_accuracy": 0.38527965545654297, + "num_tokens": 4613942859.0, + "step": 9026 + }, + { + "epoch": 2.44104921579232, + "grad_norm": 2.53125, + "learning_rate": 0.011781740541844448, + "loss": 3.0771, + "mean_token_accuracy": 0.40676045417785645, + "num_tokens": 4614467133.0, + "step": 9027 + }, + { + "epoch": 2.4413196322336397, + "grad_norm": 2.53125, + "learning_rate": 0.011780169998960041, + "loss": 2.9555, + "mean_token_accuracy": 0.4203214645385742, + "num_tokens": 4614991402.0, + "step": 9028 + }, + { + "epoch": 2.4415900486749593, + "grad_norm": 2.953125, + "learning_rate": 0.011778599432137724, + "loss": 3.1602, + "mean_token_accuracy": 0.3958210349082947, + "num_tokens": 4615515689.0, + "step": 9029 + }, + { + "epoch": 2.441860465116279, + "grad_norm": 2.84375, + "learning_rate": 0.011777028841425688, + "loss": 3.0488, + "mean_token_accuracy": 0.406960666179657, + "num_tokens": 4616039936.0, + "step": 9030 + }, + { + "epoch": 2.4421308815575986, + "grad_norm": 44.25, + "learning_rate": 0.011775458226872117, + "loss": 23.0289, + "mean_token_accuracy": 0.03274368494749069, + "num_tokens": 4616505766.0, + "step": 9031 + }, + { + "epoch": 2.4424012979989183, + "grad_norm": 6.59375, + "learning_rate": 0.011773887588525211, + "loss": 3.5302, + "mean_token_accuracy": 0.37620335817337036, + "num_tokens": 4616995655.0, + "step": 9032 + }, + { + "epoch": 2.442671714440238, + "grad_norm": 2.3125, + "learning_rate": 0.011772316926433159, + "loss": 3.076, + "mean_token_accuracy": 0.39872583746910095, + "num_tokens": 4617519736.0, + "step": 9033 + }, + { + "epoch": 2.4429421308815575, + "grad_norm": 2.78125, + "learning_rate": 0.011770746240644153, + "loss": 3.141, + "mean_token_accuracy": 0.39314836263656616, + "num_tokens": 4618044011.0, + "step": 9034 + }, + { + "epoch": 2.443212547322877, + "grad_norm": 3.234375, + "learning_rate": 0.011769175531206384, + "loss": 2.9977, + "mean_token_accuracy": 0.39460131525993347, + "num_tokens": 4618568136.0, + "step": 9035 + }, + { + "epoch": 2.443482963764197, + "grad_norm": 2.453125, + "learning_rate": 0.011767604798168048, + "loss": 3.1169, + "mean_token_accuracy": 0.4163632392883301, + "num_tokens": 4619092417.0, + "step": 9036 + }, + { + "epoch": 2.4437533802055165, + "grad_norm": 2.78125, + "learning_rate": 0.01176603404157734, + "loss": 3.1234, + "mean_token_accuracy": 0.4057667851448059, + "num_tokens": 4619610495.0, + "step": 9037 + }, + { + "epoch": 2.444023796646836, + "grad_norm": 2.546875, + "learning_rate": 0.011764463261482451, + "loss": 3.031, + "mean_token_accuracy": 0.4078049659729004, + "num_tokens": 4620134729.0, + "step": 9038 + }, + { + "epoch": 2.4442942130881558, + "grad_norm": 3.28125, + "learning_rate": 0.011762892457931587, + "loss": 2.9931, + "mean_token_accuracy": 0.4000917971134186, + "num_tokens": 4620658959.0, + "step": 9039 + }, + { + "epoch": 2.4445646295294754, + "grad_norm": 2.5, + "learning_rate": 0.011761321630972934, + "loss": 3.0749, + "mean_token_accuracy": 0.41045981645584106, + "num_tokens": 4621183217.0, + "step": 9040 + }, + { + "epoch": 2.444835045970795, + "grad_norm": 2.984375, + "learning_rate": 0.011759750780654698, + "loss": 3.2128, + "mean_token_accuracy": 0.37861406803131104, + "num_tokens": 4621707496.0, + "step": 9041 + }, + { + "epoch": 2.4451054624121147, + "grad_norm": 2.703125, + "learning_rate": 0.011758179907025074, + "loss": 3.086, + "mean_token_accuracy": 0.4198523461818695, + "num_tokens": 4622180598.0, + "step": 9042 + }, + { + "epoch": 2.4453758788534343, + "grad_norm": 3.5, + "learning_rate": 0.011756609010132257, + "loss": 3.2173, + "mean_token_accuracy": 0.3831017017364502, + "num_tokens": 4622699965.0, + "step": 9043 + }, + { + "epoch": 2.445646295294754, + "grad_norm": 2.734375, + "learning_rate": 0.011755038090024452, + "loss": 3.0577, + "mean_token_accuracy": 0.4158537983894348, + "num_tokens": 4623224234.0, + "step": 9044 + }, + { + "epoch": 2.4459167117360736, + "grad_norm": 3.015625, + "learning_rate": 0.011753467146749859, + "loss": 3.1228, + "mean_token_accuracy": 0.39180976152420044, + "num_tokens": 4623739606.0, + "step": 9045 + }, + { + "epoch": 2.4461871281773933, + "grad_norm": 2.640625, + "learning_rate": 0.011751896180356675, + "loss": 2.9292, + "mean_token_accuracy": 0.39717650413513184, + "num_tokens": 4624263675.0, + "step": 9046 + }, + { + "epoch": 2.446457544618713, + "grad_norm": 3.0625, + "learning_rate": 0.01175032519089311, + "loss": 3.2155, + "mean_token_accuracy": 0.3771290183067322, + "num_tokens": 4624787954.0, + "step": 9047 + }, + { + "epoch": 2.4467279610600325, + "grad_norm": 3.203125, + "learning_rate": 0.011748754178407358, + "loss": 2.8737, + "mean_token_accuracy": 0.423004150390625, + "num_tokens": 4625253022.0, + "step": 9048 + }, + { + "epoch": 2.446998377501352, + "grad_norm": 2.671875, + "learning_rate": 0.01174718314294763, + "loss": 3.0933, + "mean_token_accuracy": 0.3849875330924988, + "num_tokens": 4625777240.0, + "step": 9049 + }, + { + "epoch": 2.447268793942672, + "grad_norm": 3.453125, + "learning_rate": 0.011745612084562124, + "loss": 3.2202, + "mean_token_accuracy": 0.40911850333213806, + "num_tokens": 4626286954.0, + "step": 9050 + }, + { + "epoch": 2.4475392103839915, + "grad_norm": 14.9375, + "learning_rate": 0.011744041003299042, + "loss": 12.6203, + "mean_token_accuracy": 8.139482815749943e-05, + "num_tokens": 4626811127.0, + "step": 9051 + }, + { + "epoch": 2.447809626825311, + "grad_norm": 6.53125, + "learning_rate": 0.011742469899206598, + "loss": 3.5319, + "mean_token_accuracy": 0.40010273456573486, + "num_tokens": 4627332511.0, + "step": 9052 + }, + { + "epoch": 2.4480800432666308, + "grad_norm": 3.390625, + "learning_rate": 0.011740898772332996, + "loss": 3.1352, + "mean_token_accuracy": 0.3930472731590271, + "num_tokens": 4627833915.0, + "step": 9053 + }, + { + "epoch": 2.4483504597079504, + "grad_norm": 2.828125, + "learning_rate": 0.011739327622726439, + "loss": 3.1698, + "mean_token_accuracy": 0.4015323519706726, + "num_tokens": 4628358084.0, + "step": 9054 + }, + { + "epoch": 2.44862087614927, + "grad_norm": 3.015625, + "learning_rate": 0.011737756450435134, + "loss": 3.1625, + "mean_token_accuracy": 0.40722084045410156, + "num_tokens": 4628851636.0, + "step": 9055 + }, + { + "epoch": 2.4488912925905897, + "grad_norm": 2.53125, + "learning_rate": 0.011736185255507292, + "loss": 3.0676, + "mean_token_accuracy": 0.4125048518180847, + "num_tokens": 4629375869.0, + "step": 9056 + }, + { + "epoch": 2.4491617090319093, + "grad_norm": 3.484375, + "learning_rate": 0.011734614037991123, + "loss": 3.1397, + "mean_token_accuracy": 0.384918212890625, + "num_tokens": 4629900119.0, + "step": 9057 + }, + { + "epoch": 2.449432125473229, + "grad_norm": 2.640625, + "learning_rate": 0.011733042797934837, + "loss": 3.1864, + "mean_token_accuracy": 0.3904256522655487, + "num_tokens": 4630421483.0, + "step": 9058 + }, + { + "epoch": 2.449702541914548, + "grad_norm": 3.359375, + "learning_rate": 0.011731471535386635, + "loss": 2.7722, + "mean_token_accuracy": 0.43270188570022583, + "num_tokens": 4630905902.0, + "step": 9059 + }, + { + "epoch": 2.4499729583558683, + "grad_norm": 1.921875, + "learning_rate": 0.011729900250394741, + "loss": 2.9107, + "mean_token_accuracy": 0.411715567111969, + "num_tokens": 4631430028.0, + "step": 9060 + }, + { + "epoch": 2.4502433747971875, + "grad_norm": 2.9375, + "learning_rate": 0.011728328943007363, + "loss": 3.1167, + "mean_token_accuracy": 0.40861451625823975, + "num_tokens": 4631954206.0, + "step": 9061 + }, + { + "epoch": 2.4505137912385075, + "grad_norm": 2.890625, + "learning_rate": 0.011726757613272704, + "loss": 3.054, + "mean_token_accuracy": 0.39286166429519653, + "num_tokens": 4632448387.0, + "step": 9062 + }, + { + "epoch": 2.4507842076798267, + "grad_norm": 2.578125, + "learning_rate": 0.01172518626123899, + "loss": 3.29, + "mean_token_accuracy": 0.3936249613761902, + "num_tokens": 4632946081.0, + "step": 9063 + }, + { + "epoch": 2.4510546241211464, + "grad_norm": 3.640625, + "learning_rate": 0.011723614886954429, + "loss": 3.3105, + "mean_token_accuracy": 0.3874187469482422, + "num_tokens": 4633430225.0, + "step": 9064 + }, + { + "epoch": 2.451325040562466, + "grad_norm": 2.5625, + "learning_rate": 0.01172204349046723, + "loss": 3.2716, + "mean_token_accuracy": 0.3982282876968384, + "num_tokens": 4633954358.0, + "step": 9065 + }, + { + "epoch": 2.4515954570037857, + "grad_norm": 3.171875, + "learning_rate": 0.011720472071825616, + "loss": 3.3244, + "mean_token_accuracy": 0.38948333263397217, + "num_tokens": 4634460549.0, + "step": 9066 + }, + { + "epoch": 2.4518658734451053, + "grad_norm": 2.1875, + "learning_rate": 0.011718900631077802, + "loss": 3.1029, + "mean_token_accuracy": 0.4078863859176636, + "num_tokens": 4634934784.0, + "step": 9067 + }, + { + "epoch": 2.452136289886425, + "grad_norm": 3.3125, + "learning_rate": 0.011717329168272003, + "loss": 3.1355, + "mean_token_accuracy": 0.40617430210113525, + "num_tokens": 4635459053.0, + "step": 9068 + }, + { + "epoch": 2.4524067063277446, + "grad_norm": 3.046875, + "learning_rate": 0.011715757683456429, + "loss": 2.9533, + "mean_token_accuracy": 0.4174022674560547, + "num_tokens": 4635983298.0, + "step": 9069 + }, + { + "epoch": 2.4526771227690642, + "grad_norm": 2.828125, + "learning_rate": 0.011714186176679312, + "loss": 3.2506, + "mean_token_accuracy": 0.41626298427581787, + "num_tokens": 4636458975.0, + "step": 9070 + }, + { + "epoch": 2.452947539210384, + "grad_norm": 860.0, + "learning_rate": 0.01171261464798886, + "loss": 28.8568, + "mean_token_accuracy": 7.862786878831685e-05, + "num_tokens": 4636925706.0, + "step": 9071 + }, + { + "epoch": 2.4532179556517035, + "grad_norm": 8.4375, + "learning_rate": 0.011711043097433293, + "loss": 3.4936, + "mean_token_accuracy": 0.38380080461502075, + "num_tokens": 4637427791.0, + "step": 9072 + }, + { + "epoch": 2.453488372093023, + "grad_norm": 2.265625, + "learning_rate": 0.011709471525060836, + "loss": 3.1492, + "mean_token_accuracy": 0.4124011993408203, + "num_tokens": 4637951944.0, + "step": 9073 + }, + { + "epoch": 2.453758788534343, + "grad_norm": 5.53125, + "learning_rate": 0.011707899930919705, + "loss": 3.1492, + "mean_token_accuracy": 0.40747880935668945, + "num_tokens": 4638443549.0, + "step": 9074 + }, + { + "epoch": 2.4540292049756625, + "grad_norm": 2.65625, + "learning_rate": 0.01170632831505812, + "loss": 3.1201, + "mean_token_accuracy": 0.3956093192100525, + "num_tokens": 4638967773.0, + "step": 9075 + }, + { + "epoch": 2.454299621416982, + "grad_norm": 2.25, + "learning_rate": 0.011704756677524306, + "loss": 3.0729, + "mean_token_accuracy": 0.3961281180381775, + "num_tokens": 4639491913.0, + "step": 9076 + }, + { + "epoch": 2.4545700378583017, + "grad_norm": 2.1875, + "learning_rate": 0.011703185018366489, + "loss": 3.2537, + "mean_token_accuracy": 0.39065954089164734, + "num_tokens": 4640016181.0, + "step": 9077 + }, + { + "epoch": 2.4548404542996214, + "grad_norm": 2.859375, + "learning_rate": 0.011701613337632883, + "loss": 3.0762, + "mean_token_accuracy": 0.39645886421203613, + "num_tokens": 4640540326.0, + "step": 9078 + }, + { + "epoch": 2.455110870740941, + "grad_norm": 2.5625, + "learning_rate": 0.011700041635371718, + "loss": 3.1218, + "mean_token_accuracy": 0.3930973410606384, + "num_tokens": 4641064597.0, + "step": 9079 + }, + { + "epoch": 2.4553812871822607, + "grad_norm": 3.46875, + "learning_rate": 0.011698469911631218, + "loss": 3.0849, + "mean_token_accuracy": 0.39424028992652893, + "num_tokens": 4641588880.0, + "step": 9080 + }, + { + "epoch": 2.4556517036235803, + "grad_norm": 3.296875, + "learning_rate": 0.011696898166459604, + "loss": 3.2679, + "mean_token_accuracy": 0.3992534577846527, + "num_tokens": 4642104342.0, + "step": 9081 + }, + { + "epoch": 2.4559221200649, + "grad_norm": 3.671875, + "learning_rate": 0.011695326399905106, + "loss": 3.0451, + "mean_token_accuracy": 0.39121031761169434, + "num_tokens": 4642628582.0, + "step": 9082 + }, + { + "epoch": 2.4561925365062196, + "grad_norm": 3.28125, + "learning_rate": 0.01169375461201595, + "loss": 3.0207, + "mean_token_accuracy": 0.4096567630767822, + "num_tokens": 4643152829.0, + "step": 9083 + }, + { + "epoch": 2.4564629529475392, + "grad_norm": 3.484375, + "learning_rate": 0.011692182802840366, + "loss": 3.315, + "mean_token_accuracy": 0.38136428594589233, + "num_tokens": 4643677016.0, + "step": 9084 + }, + { + "epoch": 2.456733369388859, + "grad_norm": 2.71875, + "learning_rate": 0.011690610972426572, + "loss": 3.2259, + "mean_token_accuracy": 0.40301382541656494, + "num_tokens": 4644201299.0, + "step": 9085 + }, + { + "epoch": 2.4570037858301785, + "grad_norm": 2.65625, + "learning_rate": 0.011689039120822805, + "loss": 3.1005, + "mean_token_accuracy": 0.4059833288192749, + "num_tokens": 4644689381.0, + "step": 9086 + }, + { + "epoch": 2.457274202271498, + "grad_norm": 2.59375, + "learning_rate": 0.01168746724807729, + "loss": 3.1637, + "mean_token_accuracy": 0.3945571780204773, + "num_tokens": 4645191425.0, + "step": 9087 + }, + { + "epoch": 2.457544618712818, + "grad_norm": 2.546875, + "learning_rate": 0.011685895354238258, + "loss": 3.0822, + "mean_token_accuracy": 0.393043577671051, + "num_tokens": 4645715625.0, + "step": 9088 + }, + { + "epoch": 2.4578150351541375, + "grad_norm": 2.546875, + "learning_rate": 0.01168432343935394, + "loss": 3.1147, + "mean_token_accuracy": 0.40672439336776733, + "num_tokens": 4646239885.0, + "step": 9089 + }, + { + "epoch": 2.458085451595457, + "grad_norm": 2.78125, + "learning_rate": 0.011682751503472569, + "loss": 2.9902, + "mean_token_accuracy": 0.4019072651863098, + "num_tokens": 4646764003.0, + "step": 9090 + }, + { + "epoch": 2.4583558680367767, + "grad_norm": 34.75, + "learning_rate": 0.011681179546642374, + "loss": 13.5045, + "mean_token_accuracy": 0.015904691070318222, + "num_tokens": 4647288117.0, + "step": 9091 + }, + { + "epoch": 2.4586262844780964, + "grad_norm": 8.0625, + "learning_rate": 0.011679607568911583, + "loss": 3.6409, + "mean_token_accuracy": 0.3458560109138489, + "num_tokens": 4647812326.0, + "step": 9092 + }, + { + "epoch": 2.458896700919416, + "grad_norm": 2.140625, + "learning_rate": 0.011678035570328435, + "loss": 3.1898, + "mean_token_accuracy": 0.40740081667900085, + "num_tokens": 4648285281.0, + "step": 9093 + }, + { + "epoch": 2.4591671173607357, + "grad_norm": 12.625, + "learning_rate": 0.011676463550941164, + "loss": 3.081, + "mean_token_accuracy": 0.36909058690071106, + "num_tokens": 4648809540.0, + "step": 9094 + }, + { + "epoch": 2.4594375338020553, + "grad_norm": 3.859375, + "learning_rate": 0.011674891510797997, + "loss": 3.4818, + "mean_token_accuracy": 0.3512692153453827, + "num_tokens": 4649333691.0, + "step": 9095 + }, + { + "epoch": 2.459707950243375, + "grad_norm": 2.21875, + "learning_rate": 0.011673319449947178, + "loss": 3.1198, + "mean_token_accuracy": 0.3845301568508148, + "num_tokens": 4649857960.0, + "step": 9096 + }, + { + "epoch": 2.4599783666846946, + "grad_norm": 2.1875, + "learning_rate": 0.01167174736843694, + "loss": 3.2768, + "mean_token_accuracy": 0.38371163606643677, + "num_tokens": 4650382242.0, + "step": 9097 + }, + { + "epoch": 2.4602487831260142, + "grad_norm": 2.953125, + "learning_rate": 0.011670175266315514, + "loss": 3.3773, + "mean_token_accuracy": 0.3903028070926666, + "num_tokens": 4650877032.0, + "step": 9098 + }, + { + "epoch": 2.460519199567334, + "grad_norm": 2.703125, + "learning_rate": 0.01166860314363114, + "loss": 3.1429, + "mean_token_accuracy": 0.3904379904270172, + "num_tokens": 4651401311.0, + "step": 9099 + }, + { + "epoch": 2.460789616008653, + "grad_norm": 3.5, + "learning_rate": 0.011667031000432058, + "loss": 3.2243, + "mean_token_accuracy": 0.400573194026947, + "num_tokens": 4651830925.0, + "step": 9100 + }, + { + "epoch": 2.461060032449973, + "grad_norm": 2.640625, + "learning_rate": 0.011665458836766498, + "loss": 3.1722, + "mean_token_accuracy": 0.4001290798187256, + "num_tokens": 4652355023.0, + "step": 9101 + }, + { + "epoch": 2.4613304488912924, + "grad_norm": 3.28125, + "learning_rate": 0.011663886652682707, + "loss": 3.2501, + "mean_token_accuracy": 0.38911399245262146, + "num_tokens": 4652879177.0, + "step": 9102 + }, + { + "epoch": 2.4616008653326125, + "grad_norm": 3.609375, + "learning_rate": 0.011662314448228921, + "loss": 2.9986, + "mean_token_accuracy": 0.4036339521408081, + "num_tokens": 4653403422.0, + "step": 9103 + }, + { + "epoch": 2.4618712817739317, + "grad_norm": 2.75, + "learning_rate": 0.01166074222345338, + "loss": 3.043, + "mean_token_accuracy": 0.40955695509910583, + "num_tokens": 4653927697.0, + "step": 9104 + }, + { + "epoch": 2.4621416982152513, + "grad_norm": 3.171875, + "learning_rate": 0.011659169978404325, + "loss": 3.0441, + "mean_token_accuracy": 0.3971255123615265, + "num_tokens": 4654451968.0, + "step": 9105 + }, + { + "epoch": 2.462412114656571, + "grad_norm": 2.96875, + "learning_rate": 0.011657597713129995, + "loss": 2.855, + "mean_token_accuracy": 0.4275124669075012, + "num_tokens": 4654976170.0, + "step": 9106 + }, + { + "epoch": 2.4626825310978906, + "grad_norm": 2.78125, + "learning_rate": 0.011656025427678636, + "loss": 3.1949, + "mean_token_accuracy": 0.39020735025405884, + "num_tokens": 4655500246.0, + "step": 9107 + }, + { + "epoch": 2.4629529475392102, + "grad_norm": 2.75, + "learning_rate": 0.011654453122098484, + "loss": 3.1103, + "mean_token_accuracy": 0.3870667815208435, + "num_tokens": 4656024414.0, + "step": 9108 + }, + { + "epoch": 2.46322336398053, + "grad_norm": 2.953125, + "learning_rate": 0.011652880796437786, + "loss": 3.3091, + "mean_token_accuracy": 0.3799186646938324, + "num_tokens": 4656548665.0, + "step": 9109 + }, + { + "epoch": 2.4634937804218495, + "grad_norm": 3.03125, + "learning_rate": 0.011651308450744788, + "loss": 3.153, + "mean_token_accuracy": 0.41687434911727905, + "num_tokens": 4657047525.0, + "step": 9110 + }, + { + "epoch": 2.463764196863169, + "grad_norm": 113.0, + "learning_rate": 0.01164973608506773, + "loss": 17.574, + "mean_token_accuracy": 0.0, + "num_tokens": 4657571777.0, + "step": 9111 + }, + { + "epoch": 2.464034613304489, + "grad_norm": 9.1875, + "learning_rate": 0.011648163699454858, + "loss": 3.7374, + "mean_token_accuracy": 0.33440786600112915, + "num_tokens": 4658095875.0, + "step": 9112 + }, + { + "epoch": 2.4643050297458085, + "grad_norm": 2.421875, + "learning_rate": 0.011646591293954421, + "loss": 3.2651, + "mean_token_accuracy": 0.39494588971138, + "num_tokens": 4658553277.0, + "step": 9113 + }, + { + "epoch": 2.464575446187128, + "grad_norm": 2.5625, + "learning_rate": 0.011645018868614656, + "loss": 3.0626, + "mean_token_accuracy": 0.41401833295822144, + "num_tokens": 4659077396.0, + "step": 9114 + }, + { + "epoch": 2.4648458626284477, + "grad_norm": 3.859375, + "learning_rate": 0.01164344642348382, + "loss": 3.4044, + "mean_token_accuracy": 0.3871498703956604, + "num_tokens": 4659601581.0, + "step": 9115 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 3.015625, + "learning_rate": 0.01164187395861015, + "loss": 3.1722, + "mean_token_accuracy": 0.3985994756221771, + "num_tokens": 4660101791.0, + "step": 9116 + }, + { + "epoch": 2.465386695511087, + "grad_norm": 2.75, + "learning_rate": 0.011640301474041903, + "loss": 3.4082, + "mean_token_accuracy": 0.38197648525238037, + "num_tokens": 4660625926.0, + "step": 9117 + }, + { + "epoch": 2.4656571119524067, + "grad_norm": 3.53125, + "learning_rate": 0.011638728969827324, + "loss": 3.1114, + "mean_token_accuracy": 0.4011053740978241, + "num_tokens": 4661150147.0, + "step": 9118 + }, + { + "epoch": 2.4659275283937263, + "grad_norm": 3.34375, + "learning_rate": 0.01163715644601466, + "loss": 3.1012, + "mean_token_accuracy": 0.41677841544151306, + "num_tokens": 4661618071.0, + "step": 9119 + }, + { + "epoch": 2.466197944835046, + "grad_norm": 3.34375, + "learning_rate": 0.011635583902652165, + "loss": 3.0803, + "mean_token_accuracy": 0.42025226354599, + "num_tokens": 4662130927.0, + "step": 9120 + }, + { + "epoch": 2.4664683612763656, + "grad_norm": 2.75, + "learning_rate": 0.011634011339788084, + "loss": 3.2087, + "mean_token_accuracy": 0.4064251184463501, + "num_tokens": 4662655150.0, + "step": 9121 + }, + { + "epoch": 2.4667387777176852, + "grad_norm": 3.625, + "learning_rate": 0.011632438757470671, + "loss": 3.2461, + "mean_token_accuracy": 0.3965235948562622, + "num_tokens": 4663179410.0, + "step": 9122 + }, + { + "epoch": 2.467009194159005, + "grad_norm": 2.90625, + "learning_rate": 0.011630866155748178, + "loss": 3.1035, + "mean_token_accuracy": 0.39056941866874695, + "num_tokens": 4663703668.0, + "step": 9123 + }, + { + "epoch": 2.4672796106003245, + "grad_norm": 2.78125, + "learning_rate": 0.011629293534668857, + "loss": 2.8104, + "mean_token_accuracy": 0.4322534501552582, + "num_tokens": 4664227848.0, + "step": 9124 + }, + { + "epoch": 2.467550027041644, + "grad_norm": 2.46875, + "learning_rate": 0.011627720894280958, + "loss": 2.9823, + "mean_token_accuracy": 0.41146546602249146, + "num_tokens": 4664720580.0, + "step": 9125 + }, + { + "epoch": 2.467820443482964, + "grad_norm": 3.15625, + "learning_rate": 0.011626148234632736, + "loss": 3.0673, + "mean_token_accuracy": 0.4001835584640503, + "num_tokens": 4665244779.0, + "step": 9126 + }, + { + "epoch": 2.4680908599242835, + "grad_norm": 2.375, + "learning_rate": 0.011624575555772447, + "loss": 3.0784, + "mean_token_accuracy": 0.4235413372516632, + "num_tokens": 4665700885.0, + "step": 9127 + }, + { + "epoch": 2.468361276365603, + "grad_norm": 2.90625, + "learning_rate": 0.011623002857748341, + "loss": 3.0876, + "mean_token_accuracy": 0.3991449475288391, + "num_tokens": 4666191717.0, + "step": 9128 + }, + { + "epoch": 2.4686316928069227, + "grad_norm": 2.484375, + "learning_rate": 0.011621430140608676, + "loss": 3.1558, + "mean_token_accuracy": 0.3962748646736145, + "num_tokens": 4666715923.0, + "step": 9129 + }, + { + "epoch": 2.4689021092482424, + "grad_norm": 3.6875, + "learning_rate": 0.011619857404401706, + "loss": 3.2148, + "mean_token_accuracy": 0.389342337846756, + "num_tokens": 4667240050.0, + "step": 9130 + }, + { + "epoch": 2.469172525689562, + "grad_norm": 2.953125, + "learning_rate": 0.011618284649175688, + "loss": 10.7355, + "mean_token_accuracy": 1.5505709598073736e-05, + "num_tokens": 4667764207.0, + "step": 9131 + }, + { + "epoch": 2.4694429421308817, + "grad_norm": 5.75, + "learning_rate": 0.011616711874978881, + "loss": 3.168, + "mean_token_accuracy": 0.3773277997970581, + "num_tokens": 4668288378.0, + "step": 9132 + }, + { + "epoch": 2.4697133585722013, + "grad_norm": 2.4375, + "learning_rate": 0.011615139081859538, + "loss": 3.1947, + "mean_token_accuracy": 0.3998534083366394, + "num_tokens": 4668812536.0, + "step": 9133 + }, + { + "epoch": 2.469983775013521, + "grad_norm": 2.625, + "learning_rate": 0.01161356626986592, + "loss": 3.2178, + "mean_token_accuracy": 0.3837314546108246, + "num_tokens": 4669336788.0, + "step": 9134 + }, + { + "epoch": 2.4702541914548406, + "grad_norm": 3.265625, + "learning_rate": 0.011611993439046283, + "loss": 3.2069, + "mean_token_accuracy": 0.40476858615875244, + "num_tokens": 4669814252.0, + "step": 9135 + }, + { + "epoch": 2.4705246078961602, + "grad_norm": 2.359375, + "learning_rate": 0.011610420589448887, + "loss": 3.1993, + "mean_token_accuracy": 0.397675484418869, + "num_tokens": 4670301641.0, + "step": 9136 + }, + { + "epoch": 2.47079502433748, + "grad_norm": 20.0, + "learning_rate": 0.011608847721121994, + "loss": 3.0894, + "mean_token_accuracy": 0.40238624811172485, + "num_tokens": 4670825831.0, + "step": 9137 + }, + { + "epoch": 2.4710654407787995, + "grad_norm": 4.0625, + "learning_rate": 0.01160727483411386, + "loss": 3.3246, + "mean_token_accuracy": 0.37544381618499756, + "num_tokens": 4671349998.0, + "step": 9138 + }, + { + "epoch": 2.471335857220119, + "grad_norm": 1.96875, + "learning_rate": 0.011605701928472751, + "loss": 3.055, + "mean_token_accuracy": 0.41369202733039856, + "num_tokens": 4671825304.0, + "step": 9139 + }, + { + "epoch": 2.471606273661439, + "grad_norm": 3.5625, + "learning_rate": 0.011604129004246925, + "loss": 3.0523, + "mean_token_accuracy": 0.39447730779647827, + "num_tokens": 4672349469.0, + "step": 9140 + }, + { + "epoch": 2.471876690102758, + "grad_norm": 2.765625, + "learning_rate": 0.011602556061484645, + "loss": 2.9221, + "mean_token_accuracy": 0.4097011089324951, + "num_tokens": 4672873691.0, + "step": 9141 + }, + { + "epoch": 2.472147106544078, + "grad_norm": 2.984375, + "learning_rate": 0.011600983100234172, + "loss": 3.1065, + "mean_token_accuracy": 0.38687339425086975, + "num_tokens": 4673397885.0, + "step": 9142 + }, + { + "epoch": 2.4724175229853973, + "grad_norm": 2.828125, + "learning_rate": 0.011599410120543772, + "loss": 2.8465, + "mean_token_accuracy": 0.41247910261154175, + "num_tokens": 4673921958.0, + "step": 9143 + }, + { + "epoch": 2.4726879394267174, + "grad_norm": 2.71875, + "learning_rate": 0.011597837122461704, + "loss": 3.193, + "mean_token_accuracy": 0.3926588296890259, + "num_tokens": 4674354023.0, + "step": 9144 + }, + { + "epoch": 2.4729583558680366, + "grad_norm": 2.671875, + "learning_rate": 0.011596264106036234, + "loss": 3.174, + "mean_token_accuracy": 0.40667521953582764, + "num_tokens": 4674875203.0, + "step": 9145 + }, + { + "epoch": 2.4732287723093562, + "grad_norm": 3.40625, + "learning_rate": 0.011594691071315633, + "loss": 3.0015, + "mean_token_accuracy": 0.42958199977874756, + "num_tokens": 4675399277.0, + "step": 9146 + }, + { + "epoch": 2.473499188750676, + "grad_norm": 2.296875, + "learning_rate": 0.011593118018348154, + "loss": 2.9208, + "mean_token_accuracy": 0.43415921926498413, + "num_tokens": 4675923546.0, + "step": 9147 + }, + { + "epoch": 2.4737696051919955, + "grad_norm": 2.96875, + "learning_rate": 0.011591544947182076, + "loss": 3.1082, + "mean_token_accuracy": 0.4026142954826355, + "num_tokens": 4676373315.0, + "step": 9148 + }, + { + "epoch": 2.474040021633315, + "grad_norm": 2.109375, + "learning_rate": 0.011589971857865656, + "loss": 2.6713, + "mean_token_accuracy": 0.4278023838996887, + "num_tokens": 4676897597.0, + "step": 9149 + }, + { + "epoch": 2.474310438074635, + "grad_norm": 2.734375, + "learning_rate": 0.011588398750447166, + "loss": 2.9346, + "mean_token_accuracy": 0.414498895406723, + "num_tokens": 4677421871.0, + "step": 9150 + }, + { + "epoch": 2.4745808545159544, + "grad_norm": 2112.0, + "learning_rate": 0.01158682562497487, + "loss": 17.8219, + "mean_token_accuracy": 0.0, + "num_tokens": 4677946123.0, + "step": 9151 + }, + { + "epoch": 2.474851270957274, + "grad_norm": 6.40625, + "learning_rate": 0.01158525248149704, + "loss": 3.4764, + "mean_token_accuracy": 0.3714095950126648, + "num_tokens": 4678470405.0, + "step": 9152 + }, + { + "epoch": 2.4751216873985937, + "grad_norm": 2.609375, + "learning_rate": 0.011583679320061943, + "loss": 3.371, + "mean_token_accuracy": 0.3686235249042511, + "num_tokens": 4678994643.0, + "step": 9153 + }, + { + "epoch": 2.4753921038399134, + "grad_norm": 3.015625, + "learning_rate": 0.011582106140717847, + "loss": 3.289, + "mean_token_accuracy": 0.4034084677696228, + "num_tokens": 4679462610.0, + "step": 9154 + }, + { + "epoch": 2.475662520281233, + "grad_norm": 3.265625, + "learning_rate": 0.011580532943513024, + "loss": 3.3877, + "mean_token_accuracy": 0.3784264326095581, + "num_tokens": 4679986881.0, + "step": 9155 + }, + { + "epoch": 2.4759329367225527, + "grad_norm": 2.625, + "learning_rate": 0.011578959728495744, + "loss": 3.0098, + "mean_token_accuracy": 0.3779352903366089, + "num_tokens": 4680511138.0, + "step": 9156 + }, + { + "epoch": 2.4762033531638723, + "grad_norm": 3.15625, + "learning_rate": 0.011577386495714272, + "loss": 2.9208, + "mean_token_accuracy": 0.40447238087654114, + "num_tokens": 4681035249.0, + "step": 9157 + }, + { + "epoch": 2.476473769605192, + "grad_norm": 2.734375, + "learning_rate": 0.01157581324521689, + "loss": 3.2112, + "mean_token_accuracy": 0.3771722912788391, + "num_tokens": 4681559401.0, + "step": 9158 + }, + { + "epoch": 2.4767441860465116, + "grad_norm": 2.5625, + "learning_rate": 0.01157423997705186, + "loss": 3.0432, + "mean_token_accuracy": 0.39593470096588135, + "num_tokens": 4682083616.0, + "step": 9159 + }, + { + "epoch": 2.4770146024878312, + "grad_norm": 5.25, + "learning_rate": 0.011572666691267459, + "loss": 2.7743, + "mean_token_accuracy": 0.4875107407569885, + "num_tokens": 4682607898.0, + "step": 9160 + }, + { + "epoch": 2.477285018929151, + "grad_norm": 2.28125, + "learning_rate": 0.011571093387911962, + "loss": 3.362, + "mean_token_accuracy": 0.39393556118011475, + "num_tokens": 4683115088.0, + "step": 9161 + }, + { + "epoch": 2.4775554353704705, + "grad_norm": 4.375, + "learning_rate": 0.011569520067033639, + "loss": 3.1565, + "mean_token_accuracy": 0.37407517433166504, + "num_tokens": 4683639308.0, + "step": 9162 + }, + { + "epoch": 2.47782585181179, + "grad_norm": 2.703125, + "learning_rate": 0.01156794672868077, + "loss": 3.4371, + "mean_token_accuracy": 0.3850458860397339, + "num_tokens": 4684101441.0, + "step": 9163 + }, + { + "epoch": 2.47809626825311, + "grad_norm": 3.28125, + "learning_rate": 0.011566373372901617, + "loss": 3.3658, + "mean_token_accuracy": 0.3920391798019409, + "num_tokens": 4684625523.0, + "step": 9164 + }, + { + "epoch": 2.4783666846944294, + "grad_norm": 3.109375, + "learning_rate": 0.011564799999744468, + "loss": 2.937, + "mean_token_accuracy": 0.3811693787574768, + "num_tokens": 4685149773.0, + "step": 9165 + }, + { + "epoch": 2.478637101135749, + "grad_norm": 2.65625, + "learning_rate": 0.011563226609257595, + "loss": 3.2119, + "mean_token_accuracy": 0.4087064266204834, + "num_tokens": 4685625397.0, + "step": 9166 + }, + { + "epoch": 2.4789075175770687, + "grad_norm": 3.625, + "learning_rate": 0.011561653201489271, + "loss": 2.9199, + "mean_token_accuracy": 0.41408002376556396, + "num_tokens": 4686149636.0, + "step": 9167 + }, + { + "epoch": 2.4791779340183884, + "grad_norm": 2.625, + "learning_rate": 0.011560079776487776, + "loss": 3.037, + "mean_token_accuracy": 0.4134300947189331, + "num_tokens": 4686673725.0, + "step": 9168 + }, + { + "epoch": 2.479448350459708, + "grad_norm": 3.0625, + "learning_rate": 0.011558506334301388, + "loss": 3.2158, + "mean_token_accuracy": 0.3775852620601654, + "num_tokens": 4687197944.0, + "step": 9169 + }, + { + "epoch": 2.4797187669010277, + "grad_norm": 2.6875, + "learning_rate": 0.01155693287497838, + "loss": 2.9614, + "mean_token_accuracy": 0.39665189385414124, + "num_tokens": 4687722114.0, + "step": 9170 + }, + { + "epoch": 2.4799891833423473, + "grad_norm": 14.5625, + "learning_rate": 0.011555359398567036, + "loss": 12.4298, + "mean_token_accuracy": 0.008107013069093227, + "num_tokens": 4688246089.0, + "step": 9171 + }, + { + "epoch": 2.480259599783667, + "grad_norm": 5.875, + "learning_rate": 0.01155378590511563, + "loss": 3.343, + "mean_token_accuracy": 0.36451417207717896, + "num_tokens": 4688770367.0, + "step": 9172 + }, + { + "epoch": 2.4805300162249866, + "grad_norm": 2.0625, + "learning_rate": 0.011552212394672445, + "loss": 3.2349, + "mean_token_accuracy": 0.3942503035068512, + "num_tokens": 4689294453.0, + "step": 9173 + }, + { + "epoch": 2.4808004326663062, + "grad_norm": 2.890625, + "learning_rate": 0.01155063886728576, + "loss": 3.0034, + "mean_token_accuracy": 0.4051628112792969, + "num_tokens": 4689818546.0, + "step": 9174 + }, + { + "epoch": 2.481070849107626, + "grad_norm": 2.421875, + "learning_rate": 0.011549065323003855, + "loss": 3.1938, + "mean_token_accuracy": 0.3913618326187134, + "num_tokens": 4690342670.0, + "step": 9175 + }, + { + "epoch": 2.4813412655489455, + "grad_norm": 2.71875, + "learning_rate": 0.011547491761875013, + "loss": 2.9817, + "mean_token_accuracy": 0.4081643223762512, + "num_tokens": 4690866908.0, + "step": 9176 + }, + { + "epoch": 2.481611681990265, + "grad_norm": 2.34375, + "learning_rate": 0.011545918183947512, + "loss": 3.2738, + "mean_token_accuracy": 0.39467066526412964, + "num_tokens": 4691391159.0, + "step": 9177 + }, + { + "epoch": 2.481882098431585, + "grad_norm": 3.390625, + "learning_rate": 0.011544344589269635, + "loss": 3.0251, + "mean_token_accuracy": 0.4005610942840576, + "num_tokens": 4691915332.0, + "step": 9178 + }, + { + "epoch": 2.4821525148729044, + "grad_norm": 2.265625, + "learning_rate": 0.011542770977889668, + "loss": 2.9874, + "mean_token_accuracy": 0.41085702180862427, + "num_tokens": 4692439451.0, + "step": 9179 + }, + { + "epoch": 2.482422931314224, + "grad_norm": 2.453125, + "learning_rate": 0.01154119734985589, + "loss": 3.1243, + "mean_token_accuracy": 0.40624624490737915, + "num_tokens": 4692922894.0, + "step": 9180 + }, + { + "epoch": 2.4826933477555437, + "grad_norm": 2.609375, + "learning_rate": 0.011539623705216582, + "loss": 3.2499, + "mean_token_accuracy": 0.41089218854904175, + "num_tokens": 4693447127.0, + "step": 9181 + }, + { + "epoch": 2.482963764196863, + "grad_norm": 3.609375, + "learning_rate": 0.011538050044020039, + "loss": 3.4072, + "mean_token_accuracy": 0.3691319227218628, + "num_tokens": 4693971297.0, + "step": 9182 + }, + { + "epoch": 2.483234180638183, + "grad_norm": 2.6875, + "learning_rate": 0.011536476366314535, + "loss": 3.274, + "mean_token_accuracy": 0.3886988162994385, + "num_tokens": 4694495475.0, + "step": 9183 + }, + { + "epoch": 2.483504597079502, + "grad_norm": 2.703125, + "learning_rate": 0.011534902672148356, + "loss": 3.3391, + "mean_token_accuracy": 0.3833008110523224, + "num_tokens": 4695019740.0, + "step": 9184 + }, + { + "epoch": 2.4837750135208223, + "grad_norm": 2.515625, + "learning_rate": 0.01153332896156979, + "loss": 3.2175, + "mean_token_accuracy": 0.41938579082489014, + "num_tokens": 4695543923.0, + "step": 9185 + }, + { + "epoch": 2.4840454299621415, + "grad_norm": 3.390625, + "learning_rate": 0.011531755234627124, + "loss": 3.2524, + "mean_token_accuracy": 0.37854674458503723, + "num_tokens": 4696068072.0, + "step": 9186 + }, + { + "epoch": 2.484315846403461, + "grad_norm": 2.375, + "learning_rate": 0.011530181491368646, + "loss": 3.1913, + "mean_token_accuracy": 0.40849676728248596, + "num_tokens": 4696592245.0, + "step": 9187 + }, + { + "epoch": 2.484586262844781, + "grad_norm": 3.140625, + "learning_rate": 0.011528607731842637, + "loss": 3.148, + "mean_token_accuracy": 0.4163086414337158, + "num_tokens": 4697116504.0, + "step": 9188 + }, + { + "epoch": 2.4848566792861004, + "grad_norm": 2.921875, + "learning_rate": 0.011527033956097393, + "loss": 3.0474, + "mean_token_accuracy": 0.4029718041419983, + "num_tokens": 4697640581.0, + "step": 9189 + }, + { + "epoch": 2.48512709572742, + "grad_norm": 2.828125, + "learning_rate": 0.011525460164181192, + "loss": 3.0463, + "mean_token_accuracy": 0.43720322847366333, + "num_tokens": 4698106657.0, + "step": 9190 + }, + { + "epoch": 2.4853975121687397, + "grad_norm": 34.0, + "learning_rate": 0.011523886356142333, + "loss": 13.3453, + "mean_token_accuracy": 1.5994221030268818e-05, + "num_tokens": 4698602583.0, + "step": 9191 + }, + { + "epoch": 2.4856679286100594, + "grad_norm": 7.1875, + "learning_rate": 0.011522312532029098, + "loss": 3.4437, + "mean_token_accuracy": 0.3570234179496765, + "num_tokens": 4699126824.0, + "step": 9192 + }, + { + "epoch": 2.485938345051379, + "grad_norm": 2.515625, + "learning_rate": 0.011520738691889775, + "loss": 3.1966, + "mean_token_accuracy": 0.39568305015563965, + "num_tokens": 4699651093.0, + "step": 9193 + }, + { + "epoch": 2.4862087614926986, + "grad_norm": 2.953125, + "learning_rate": 0.01151916483577266, + "loss": 3.1073, + "mean_token_accuracy": 0.409378319978714, + "num_tokens": 4700142075.0, + "step": 9194 + }, + { + "epoch": 2.4864791779340183, + "grad_norm": 3.15625, + "learning_rate": 0.011517590963726041, + "loss": 3.1779, + "mean_token_accuracy": 0.4017922282218933, + "num_tokens": 4700655098.0, + "step": 9195 + }, + { + "epoch": 2.486749594375338, + "grad_norm": 3.109375, + "learning_rate": 0.011516017075798211, + "loss": 3.0676, + "mean_token_accuracy": 0.4178037941455841, + "num_tokens": 4701179378.0, + "step": 9196 + }, + { + "epoch": 2.4870200108166576, + "grad_norm": 2.953125, + "learning_rate": 0.011514443172037458, + "loss": 3.103, + "mean_token_accuracy": 0.4205578863620758, + "num_tokens": 4701653739.0, + "step": 9197 + }, + { + "epoch": 2.487290427257977, + "grad_norm": 3.125, + "learning_rate": 0.011512869252492076, + "loss": 3.0988, + "mean_token_accuracy": 0.400959849357605, + "num_tokens": 4702142673.0, + "step": 9198 + }, + { + "epoch": 2.487560843699297, + "grad_norm": 27.25, + "learning_rate": 0.011511295317210358, + "loss": 3.2795, + "mean_token_accuracy": 0.4068751931190491, + "num_tokens": 4702666894.0, + "step": 9199 + }, + { + "epoch": 2.4878312601406165, + "grad_norm": 3.65625, + "learning_rate": 0.011509721366240596, + "loss": 3.0129, + "mean_token_accuracy": 0.4104446768760681, + "num_tokens": 4703191057.0, + "step": 9200 + }, + { + "epoch": 2.488101676581936, + "grad_norm": 1.7421875, + "learning_rate": 0.011508147399631076, + "loss": 2.9336, + "mean_token_accuracy": 0.38422614336013794, + "num_tokens": 4703715268.0, + "step": 9201 + }, + { + "epoch": 2.488372093023256, + "grad_norm": 2.09375, + "learning_rate": 0.01150657341743011, + "loss": 3.2371, + "mean_token_accuracy": 0.4347918629646301, + "num_tokens": 4704096878.0, + "step": 9202 + }, + { + "epoch": 2.4886425094645754, + "grad_norm": 2.4375, + "learning_rate": 0.011504999419685977, + "loss": 3.2112, + "mean_token_accuracy": 0.37965884804725647, + "num_tokens": 4704620990.0, + "step": 9203 + }, + { + "epoch": 2.488912925905895, + "grad_norm": 2.625, + "learning_rate": 0.011503425406446976, + "loss": 3.1648, + "mean_token_accuracy": 0.406541645526886, + "num_tokens": 4705133161.0, + "step": 9204 + }, + { + "epoch": 2.4891833423472147, + "grad_norm": 4.0, + "learning_rate": 0.011501851377761408, + "loss": 3.2364, + "mean_token_accuracy": 0.4033327102661133, + "num_tokens": 4705657389.0, + "step": 9205 + }, + { + "epoch": 2.4894537587885344, + "grad_norm": 4.03125, + "learning_rate": 0.01150027733367756, + "loss": 2.9054, + "mean_token_accuracy": 0.4100169241428375, + "num_tokens": 4706181660.0, + "step": 9206 + }, + { + "epoch": 2.489724175229854, + "grad_norm": 13.25, + "learning_rate": 0.011498703274243731, + "loss": 3.19, + "mean_token_accuracy": 0.40779557824134827, + "num_tokens": 4706705815.0, + "step": 9207 + }, + { + "epoch": 2.4899945916711737, + "grad_norm": 2.578125, + "learning_rate": 0.01149712919950822, + "loss": 3.4255, + "mean_token_accuracy": 0.3720315098762512, + "num_tokens": 4707213961.0, + "step": 9208 + }, + { + "epoch": 2.4902650081124933, + "grad_norm": 3.015625, + "learning_rate": 0.011495555109519322, + "loss": 3.1926, + "mean_token_accuracy": 0.3849446177482605, + "num_tokens": 4707737923.0, + "step": 9209 + }, + { + "epoch": 2.490535424553813, + "grad_norm": 3.734375, + "learning_rate": 0.011493981004325337, + "loss": 3.3135, + "mean_token_accuracy": 0.38369888067245483, + "num_tokens": 4708262105.0, + "step": 9210 + }, + { + "epoch": 2.4908058409951326, + "grad_norm": 4.125, + "learning_rate": 0.011492406883974564, + "loss": 11.6584, + "mean_token_accuracy": 5.378433343139477e-05, + "num_tokens": 4708786305.0, + "step": 9211 + }, + { + "epoch": 2.4910762574364522, + "grad_norm": 5.84375, + "learning_rate": 0.011490832748515301, + "loss": 3.5453, + "mean_token_accuracy": 0.3294658064842224, + "num_tokens": 4709310522.0, + "step": 9212 + }, + { + "epoch": 2.491346673877772, + "grad_norm": 1.734375, + "learning_rate": 0.011489258597995843, + "loss": 3.2117, + "mean_token_accuracy": 0.3734879195690155, + "num_tokens": 4709834729.0, + "step": 9213 + }, + { + "epoch": 2.4916170903190915, + "grad_norm": 2.75, + "learning_rate": 0.011487684432464491, + "loss": 3.2109, + "mean_token_accuracy": 0.3789396584033966, + "num_tokens": 4710358967.0, + "step": 9214 + }, + { + "epoch": 2.491887506760411, + "grad_norm": 3.375, + "learning_rate": 0.011486110251969547, + "loss": 3.1652, + "mean_token_accuracy": 0.3994336724281311, + "num_tokens": 4710883179.0, + "step": 9215 + }, + { + "epoch": 2.492157923201731, + "grad_norm": 2.96875, + "learning_rate": 0.011484536056559311, + "loss": 3.2184, + "mean_token_accuracy": 0.3961578905582428, + "num_tokens": 4711381645.0, + "step": 9216 + }, + { + "epoch": 2.4924283396430504, + "grad_norm": 2.8125, + "learning_rate": 0.011482961846282088, + "loss": 3.1763, + "mean_token_accuracy": 0.38712453842163086, + "num_tokens": 4711905806.0, + "step": 9217 + }, + { + "epoch": 2.49269875608437, + "grad_norm": 2.65625, + "learning_rate": 0.011481387621186168, + "loss": 3.1885, + "mean_token_accuracy": 0.3851640224456787, + "num_tokens": 4712430000.0, + "step": 9218 + }, + { + "epoch": 2.4929691725256897, + "grad_norm": 12.875, + "learning_rate": 0.011479813381319866, + "loss": 3.6265, + "mean_token_accuracy": 0.38869935274124146, + "num_tokens": 4712927807.0, + "step": 9219 + }, + { + "epoch": 2.4932395889670094, + "grad_norm": 2.890625, + "learning_rate": 0.011478239126731475, + "loss": 3.258, + "mean_token_accuracy": 0.36416834592819214, + "num_tokens": 4713451931.0, + "step": 9220 + }, + { + "epoch": 2.493510005408329, + "grad_norm": 2.1875, + "learning_rate": 0.011476664857469305, + "loss": 3.168, + "mean_token_accuracy": 0.36985060572624207, + "num_tokens": 4713976055.0, + "step": 9221 + }, + { + "epoch": 2.4937804218496487, + "grad_norm": 2.609375, + "learning_rate": 0.011475090573581653, + "loss": 3.0373, + "mean_token_accuracy": 0.3768532872200012, + "num_tokens": 4714500324.0, + "step": 9222 + }, + { + "epoch": 2.494050838290968, + "grad_norm": 3.171875, + "learning_rate": 0.011473516275116825, + "loss": 3.2201, + "mean_token_accuracy": 0.3842039108276367, + "num_tokens": 4715024600.0, + "step": 9223 + }, + { + "epoch": 2.494321254732288, + "grad_norm": 3.9375, + "learning_rate": 0.011471941962123125, + "loss": 3.0164, + "mean_token_accuracy": 0.3952593207359314, + "num_tokens": 4715548739.0, + "step": 9224 + }, + { + "epoch": 2.494591671173607, + "grad_norm": 3.1875, + "learning_rate": 0.01147036763464886, + "loss": 3.1174, + "mean_token_accuracy": 0.39118415117263794, + "num_tokens": 4716072927.0, + "step": 9225 + }, + { + "epoch": 2.4948620876149272, + "grad_norm": 2.796875, + "learning_rate": 0.01146879329274233, + "loss": 3.0392, + "mean_token_accuracy": 0.39590951800346375, + "num_tokens": 4716597178.0, + "step": 9226 + }, + { + "epoch": 2.4951325040562464, + "grad_norm": 2.765625, + "learning_rate": 0.011467218936451843, + "loss": 3.1391, + "mean_token_accuracy": 0.3978811502456665, + "num_tokens": 4717121420.0, + "step": 9227 + }, + { + "epoch": 2.495402920497566, + "grad_norm": 2.84375, + "learning_rate": 0.011465644565825708, + "loss": 3.1068, + "mean_token_accuracy": 0.3921491503715515, + "num_tokens": 4717613165.0, + "step": 9228 + }, + { + "epoch": 2.4956733369388857, + "grad_norm": 3.015625, + "learning_rate": 0.011464070180912223, + "loss": 3.2559, + "mean_token_accuracy": 0.3981170654296875, + "num_tokens": 4718137418.0, + "step": 9229 + }, + { + "epoch": 2.4959437533802054, + "grad_norm": 3.40625, + "learning_rate": 0.011462495781759703, + "loss": 3.2632, + "mean_token_accuracy": 0.3880465030670166, + "num_tokens": 4718661669.0, + "step": 9230 + }, + { + "epoch": 2.496214169821525, + "grad_norm": 30.25, + "learning_rate": 0.011460921368416453, + "loss": 12.9197, + "mean_token_accuracy": 0.013497019186615944, + "num_tokens": 4719185824.0, + "step": 9231 + }, + { + "epoch": 2.4964845862628446, + "grad_norm": 5.5, + "learning_rate": 0.01145934694093078, + "loss": 3.3545, + "mean_token_accuracy": 0.3974888324737549, + "num_tokens": 4719652673.0, + "step": 9232 + }, + { + "epoch": 2.4967550027041643, + "grad_norm": 2.03125, + "learning_rate": 0.01145777249935099, + "loss": 3.1503, + "mean_token_accuracy": 0.41238686442375183, + "num_tokens": 4720116480.0, + "step": 9233 + }, + { + "epoch": 2.497025419145484, + "grad_norm": 3.078125, + "learning_rate": 0.011456198043725397, + "loss": 2.9314, + "mean_token_accuracy": 0.4016302227973938, + "num_tokens": 4720640627.0, + "step": 9234 + }, + { + "epoch": 2.4972958355868036, + "grad_norm": 2.890625, + "learning_rate": 0.011454623574102304, + "loss": 3.1296, + "mean_token_accuracy": 0.38227221369743347, + "num_tokens": 4721164905.0, + "step": 9235 + }, + { + "epoch": 2.497566252028123, + "grad_norm": 3.390625, + "learning_rate": 0.011453049090530023, + "loss": 3.2432, + "mean_token_accuracy": 0.38993731141090393, + "num_tokens": 4721689061.0, + "step": 9236 + }, + { + "epoch": 2.497836668469443, + "grad_norm": 2.265625, + "learning_rate": 0.011451474593056864, + "loss": 2.9718, + "mean_token_accuracy": 0.4029048681259155, + "num_tokens": 4722213201.0, + "step": 9237 + }, + { + "epoch": 2.4981070849107625, + "grad_norm": 2.5625, + "learning_rate": 0.011449900081731136, + "loss": 3.1797, + "mean_token_accuracy": 0.39320939779281616, + "num_tokens": 4722737345.0, + "step": 9238 + }, + { + "epoch": 2.498377501352082, + "grad_norm": 2.640625, + "learning_rate": 0.011448325556601153, + "loss": 2.9712, + "mean_token_accuracy": 0.40793168544769287, + "num_tokens": 4723261552.0, + "step": 9239 + }, + { + "epoch": 2.498647917793402, + "grad_norm": 3.1875, + "learning_rate": 0.011446751017715221, + "loss": 3.0294, + "mean_token_accuracy": 0.3927556276321411, + "num_tokens": 4723785748.0, + "step": 9240 + }, + { + "epoch": 2.4989183342347214, + "grad_norm": 2.8125, + "learning_rate": 0.011445176465121654, + "loss": 3.0008, + "mean_token_accuracy": 0.41417044401168823, + "num_tokens": 4724310023.0, + "step": 9241 + }, + { + "epoch": 2.499188750676041, + "grad_norm": 2.65625, + "learning_rate": 0.011443601898868767, + "loss": 3.0467, + "mean_token_accuracy": 0.41125982999801636, + "num_tokens": 4724749364.0, + "step": 9242 + }, + { + "epoch": 2.4994591671173607, + "grad_norm": 2.390625, + "learning_rate": 0.011442027319004865, + "loss": 2.847, + "mean_token_accuracy": 0.4162953197956085, + "num_tokens": 4725273646.0, + "step": 9243 + }, + { + "epoch": 2.4997295835586804, + "grad_norm": 2.703125, + "learning_rate": 0.011440452725578266, + "loss": 2.8897, + "mean_token_accuracy": 0.420013964176178, + "num_tokens": 4725766468.0, + "step": 9244 + }, + { + "epoch": 2.5, + "grad_norm": 3.34375, + "learning_rate": 0.011438878118637282, + "loss": 3.2331, + "mean_token_accuracy": 0.39638498425483704, + "num_tokens": 4726238359.0, + "step": 9245 + }, + { + "epoch": 2.5002704164413196, + "grad_norm": 4.09375, + "learning_rate": 0.011437303498230227, + "loss": 3.3691, + "mean_token_accuracy": 0.3806750178337097, + "num_tokens": 4726762637.0, + "step": 9246 + }, + { + "epoch": 2.5005408328826393, + "grad_norm": 2.75, + "learning_rate": 0.011435728864405414, + "loss": 3.0469, + "mean_token_accuracy": 0.4160342216491699, + "num_tokens": 4727189678.0, + "step": 9247 + }, + { + "epoch": 2.500811249323959, + "grad_norm": 3.453125, + "learning_rate": 0.011434154217211164, + "loss": 3.0328, + "mean_token_accuracy": 0.3859174847602844, + "num_tokens": 4727713803.0, + "step": 9248 + }, + { + "epoch": 2.5010816657652786, + "grad_norm": 2.796875, + "learning_rate": 0.01143257955669578, + "loss": 3.1684, + "mean_token_accuracy": 0.3762505352497101, + "num_tokens": 4728237980.0, + "step": 9249 + }, + { + "epoch": 2.501352082206598, + "grad_norm": 2.609375, + "learning_rate": 0.011431004882907584, + "loss": 3.1075, + "mean_token_accuracy": 0.4139072299003601, + "num_tokens": 4728730821.0, + "step": 9250 + }, + { + "epoch": 2.501622498647918, + "grad_norm": 3.203125, + "learning_rate": 0.011429430195894888, + "loss": 10.2082, + "mean_token_accuracy": 0.00011724131763912737, + "num_tokens": 4729254985.0, + "step": 9251 + }, + { + "epoch": 2.5018929150892375, + "grad_norm": 6.65625, + "learning_rate": 0.011427855495706015, + "loss": 3.6617, + "mean_token_accuracy": 0.31997162103652954, + "num_tokens": 4729729367.0, + "step": 9252 + }, + { + "epoch": 2.502163331530557, + "grad_norm": 2.859375, + "learning_rate": 0.011426280782389274, + "loss": 3.1482, + "mean_token_accuracy": 0.39314359426498413, + "num_tokens": 4730226037.0, + "step": 9253 + }, + { + "epoch": 2.502433747971877, + "grad_norm": 2.890625, + "learning_rate": 0.011424706055992986, + "loss": 3.1061, + "mean_token_accuracy": 0.40816718339920044, + "num_tokens": 4730723485.0, + "step": 9254 + }, + { + "epoch": 2.5027041644131964, + "grad_norm": 3.046875, + "learning_rate": 0.01142313131656547, + "loss": 3.1985, + "mean_token_accuracy": 0.3871660828590393, + "num_tokens": 4731224551.0, + "step": 9255 + }, + { + "epoch": 2.502974580854516, + "grad_norm": 2.890625, + "learning_rate": 0.011421556564155038, + "loss": 3.2727, + "mean_token_accuracy": 0.3956957161426544, + "num_tokens": 4731748737.0, + "step": 9256 + }, + { + "epoch": 2.5032449972958357, + "grad_norm": 2.640625, + "learning_rate": 0.01141998179881001, + "loss": 2.8397, + "mean_token_accuracy": 0.400546133518219, + "num_tokens": 4732272881.0, + "step": 9257 + }, + { + "epoch": 2.5035154137371554, + "grad_norm": 3.078125, + "learning_rate": 0.011418407020578707, + "loss": 3.339, + "mean_token_accuracy": 0.3805333375930786, + "num_tokens": 4732797122.0, + "step": 9258 + }, + { + "epoch": 2.503785830178475, + "grad_norm": 3.390625, + "learning_rate": 0.011416832229509446, + "loss": 3.2528, + "mean_token_accuracy": 0.3813610076904297, + "num_tokens": 4733321359.0, + "step": 9259 + }, + { + "epoch": 2.5040562466197946, + "grad_norm": 2.4375, + "learning_rate": 0.011415257425650544, + "loss": 3.0477, + "mean_token_accuracy": 0.409995436668396, + "num_tokens": 4733845559.0, + "step": 9260 + }, + { + "epoch": 2.5043266630611143, + "grad_norm": 2.671875, + "learning_rate": 0.011413682609050329, + "loss": 3.0627, + "mean_token_accuracy": 0.4006986916065216, + "num_tokens": 4734369827.0, + "step": 9261 + }, + { + "epoch": 2.5045970795024335, + "grad_norm": 2.5625, + "learning_rate": 0.01141210777975711, + "loss": 3.0737, + "mean_token_accuracy": 0.4154985249042511, + "num_tokens": 4734893984.0, + "step": 9262 + }, + { + "epoch": 2.5048674959437536, + "grad_norm": 3.1875, + "learning_rate": 0.011410532937819214, + "loss": 3.1628, + "mean_token_accuracy": 0.3958436846733093, + "num_tokens": 4735396700.0, + "step": 9263 + }, + { + "epoch": 2.5051379123850728, + "grad_norm": 2.625, + "learning_rate": 0.011408958083284959, + "loss": 3.2702, + "mean_token_accuracy": 0.37449318170547485, + "num_tokens": 4735920889.0, + "step": 9264 + }, + { + "epoch": 2.505408328826393, + "grad_norm": 2.4375, + "learning_rate": 0.011407383216202667, + "loss": 2.9778, + "mean_token_accuracy": 0.4121473729610443, + "num_tokens": 4736444986.0, + "step": 9265 + }, + { + "epoch": 2.505678745267712, + "grad_norm": 2.265625, + "learning_rate": 0.01140580833662066, + "loss": 3.176, + "mean_token_accuracy": 0.39591842889785767, + "num_tokens": 4736969063.0, + "step": 9266 + }, + { + "epoch": 2.505949161709032, + "grad_norm": 3.15625, + "learning_rate": 0.011404233444587263, + "loss": 3.1071, + "mean_token_accuracy": 0.3922228217124939, + "num_tokens": 4737493337.0, + "step": 9267 + }, + { + "epoch": 2.5062195781503513, + "grad_norm": 2.625, + "learning_rate": 0.011402658540150793, + "loss": 3.0108, + "mean_token_accuracy": 0.38722312450408936, + "num_tokens": 4738010889.0, + "step": 9268 + }, + { + "epoch": 2.5064899945916714, + "grad_norm": 2.828125, + "learning_rate": 0.011401083623359577, + "loss": 2.8959, + "mean_token_accuracy": 0.4158981144428253, + "num_tokens": 4738535017.0, + "step": 9269 + }, + { + "epoch": 2.5067604110329906, + "grad_norm": 3.203125, + "learning_rate": 0.011399508694261934, + "loss": 3.0472, + "mean_token_accuracy": 0.40277183055877686, + "num_tokens": 4739050763.0, + "step": 9270 + }, + { + "epoch": 2.5070308274743103, + "grad_norm": 18.125, + "learning_rate": 0.011397933752906186, + "loss": 11.1364, + "mean_token_accuracy": 0.015462932176887989, + "num_tokens": 4739502772.0, + "step": 9271 + }, + { + "epoch": 2.50730124391563, + "grad_norm": 10.3125, + "learning_rate": 0.011396358799340662, + "loss": 3.3461, + "mean_token_accuracy": 0.3647582530975342, + "num_tokens": 4739997450.0, + "step": 9272 + }, + { + "epoch": 2.5075716603569496, + "grad_norm": 2.109375, + "learning_rate": 0.011394783833613687, + "loss": 3.0871, + "mean_token_accuracy": 0.39657455682754517, + "num_tokens": 4740521729.0, + "step": 9273 + }, + { + "epoch": 2.507842076798269, + "grad_norm": 2.375, + "learning_rate": 0.011393208855773584, + "loss": 2.8968, + "mean_token_accuracy": 0.4216417670249939, + "num_tokens": 4741023034.0, + "step": 9274 + }, + { + "epoch": 2.508112493239589, + "grad_norm": 2.75, + "learning_rate": 0.011391633865868674, + "loss": 3.1506, + "mean_token_accuracy": 0.40766462683677673, + "num_tokens": 4741547265.0, + "step": 9275 + }, + { + "epoch": 2.5083829096809085, + "grad_norm": 3.34375, + "learning_rate": 0.011390058863947285, + "loss": 2.9921, + "mean_token_accuracy": 0.4183560013771057, + "num_tokens": 4742071467.0, + "step": 9276 + }, + { + "epoch": 2.508653326122228, + "grad_norm": 1.890625, + "learning_rate": 0.011388483850057744, + "loss": 3.0686, + "mean_token_accuracy": 0.4083508849143982, + "num_tokens": 4742575549.0, + "step": 9277 + }, + { + "epoch": 2.5089237425635478, + "grad_norm": 2.90625, + "learning_rate": 0.011386908824248379, + "loss": 3.0697, + "mean_token_accuracy": 0.3892894983291626, + "num_tokens": 4743099806.0, + "step": 9278 + }, + { + "epoch": 2.5091941590048674, + "grad_norm": 3.1875, + "learning_rate": 0.011385333786567508, + "loss": 3.2439, + "mean_token_accuracy": 0.38609766960144043, + "num_tokens": 4743617453.0, + "step": 9279 + }, + { + "epoch": 2.509464575446187, + "grad_norm": 3.421875, + "learning_rate": 0.011383758737063465, + "loss": 3.0907, + "mean_token_accuracy": 0.3931151032447815, + "num_tokens": 4744141468.0, + "step": 9280 + }, + { + "epoch": 2.5097349918875067, + "grad_norm": 2.375, + "learning_rate": 0.011382183675784576, + "loss": 3.2054, + "mean_token_accuracy": 0.4171094298362732, + "num_tokens": 4744611487.0, + "step": 9281 + }, + { + "epoch": 2.5100054083288263, + "grad_norm": 3.359375, + "learning_rate": 0.01138060860277917, + "loss": 3.1763, + "mean_token_accuracy": 0.3882782459259033, + "num_tokens": 4745094394.0, + "step": 9282 + }, + { + "epoch": 2.510275824770146, + "grad_norm": 2.171875, + "learning_rate": 0.01137903351809557, + "loss": 3.1084, + "mean_token_accuracy": 0.407912015914917, + "num_tokens": 4745585836.0, + "step": 9283 + }, + { + "epoch": 2.5105462412114656, + "grad_norm": 3.984375, + "learning_rate": 0.01137745842178211, + "loss": 3.3009, + "mean_token_accuracy": 0.3844994604587555, + "num_tokens": 4746110051.0, + "step": 9284 + }, + { + "epoch": 2.5108166576527853, + "grad_norm": 2.65625, + "learning_rate": 0.011375883313887112, + "loss": 2.9136, + "mean_token_accuracy": 0.4120984375476837, + "num_tokens": 4746634255.0, + "step": 9285 + }, + { + "epoch": 2.511087074094105, + "grad_norm": 2.90625, + "learning_rate": 0.011374308194458909, + "loss": 2.9972, + "mean_token_accuracy": 0.41220012307167053, + "num_tokens": 4747158407.0, + "step": 9286 + }, + { + "epoch": 2.5113574905354246, + "grad_norm": 2.484375, + "learning_rate": 0.01137273306354583, + "loss": 2.9972, + "mean_token_accuracy": 0.4222108721733093, + "num_tokens": 4747641052.0, + "step": 9287 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 2.78125, + "learning_rate": 0.011371157921196208, + "loss": 2.9137, + "mean_token_accuracy": 0.41834747791290283, + "num_tokens": 4748145595.0, + "step": 9288 + }, + { + "epoch": 2.511898323418064, + "grad_norm": 3.828125, + "learning_rate": 0.011369582767458364, + "loss": 3.2519, + "mean_token_accuracy": 0.3938200771808624, + "num_tokens": 4748669828.0, + "step": 9289 + }, + { + "epoch": 2.5121687398593835, + "grad_norm": 2.71875, + "learning_rate": 0.011368007602380636, + "loss": 3.1624, + "mean_token_accuracy": 0.36116814613342285, + "num_tokens": 4749194048.0, + "step": 9290 + }, + { + "epoch": 2.512439156300703, + "grad_norm": 5.875, + "learning_rate": 0.011366432426011356, + "loss": 8.9257, + "mean_token_accuracy": 8.649013034300879e-05, + "num_tokens": 4749628025.0, + "step": 9291 + }, + { + "epoch": 2.512709572742023, + "grad_norm": 6.6875, + "learning_rate": 0.011364857238398847, + "loss": 3.5585, + "mean_token_accuracy": 0.3774041533470154, + "num_tokens": 4750152304.0, + "step": 9292 + }, + { + "epoch": 2.5129799891833424, + "grad_norm": 2.203125, + "learning_rate": 0.011363282039591447, + "loss": 3.3234, + "mean_token_accuracy": 0.3737272620201111, + "num_tokens": 4750668962.0, + "step": 9293 + }, + { + "epoch": 2.513250405624662, + "grad_norm": 4.625, + "learning_rate": 0.011361706829637486, + "loss": 3.4738, + "mean_token_accuracy": 0.3868424892425537, + "num_tokens": 4751122836.0, + "step": 9294 + }, + { + "epoch": 2.5135208220659817, + "grad_norm": 2.421875, + "learning_rate": 0.011360131608585294, + "loss": 3.0847, + "mean_token_accuracy": 0.3980037569999695, + "num_tokens": 4751646995.0, + "step": 9295 + }, + { + "epoch": 2.5137912385073014, + "grad_norm": 3.34375, + "learning_rate": 0.011358556376483205, + "loss": 3.085, + "mean_token_accuracy": 0.4208531975746155, + "num_tokens": 4752171263.0, + "step": 9296 + }, + { + "epoch": 2.514061654948621, + "grad_norm": 2.5, + "learning_rate": 0.011356981133379556, + "loss": 3.325, + "mean_token_accuracy": 0.4274766743183136, + "num_tokens": 4752589411.0, + "step": 9297 + }, + { + "epoch": 2.5143320713899406, + "grad_norm": 2.53125, + "learning_rate": 0.011355405879322671, + "loss": 3.2069, + "mean_token_accuracy": 0.40509122610092163, + "num_tokens": 4753068734.0, + "step": 9298 + }, + { + "epoch": 2.5146024878312603, + "grad_norm": 3.6875, + "learning_rate": 0.01135383061436089, + "loss": 3.3547, + "mean_token_accuracy": 0.3980332016944885, + "num_tokens": 4753592936.0, + "step": 9299 + }, + { + "epoch": 2.51487290427258, + "grad_norm": 4.5, + "learning_rate": 0.011352255338542545, + "loss": 3.2405, + "mean_token_accuracy": 0.3852630853652954, + "num_tokens": 4754062934.0, + "step": 9300 + }, + { + "epoch": 2.5151433207138996, + "grad_norm": 3.265625, + "learning_rate": 0.011350680051915972, + "loss": 2.8795, + "mean_token_accuracy": 0.43233323097229004, + "num_tokens": 4754587108.0, + "step": 9301 + }, + { + "epoch": 2.515413737155219, + "grad_norm": 2.859375, + "learning_rate": 0.011349104754529502, + "loss": 3.3092, + "mean_token_accuracy": 0.37301748991012573, + "num_tokens": 4755111350.0, + "step": 9302 + }, + { + "epoch": 2.5156841535965384, + "grad_norm": 3.15625, + "learning_rate": 0.011347529446431472, + "loss": 3.0183, + "mean_token_accuracy": 0.39107146859169006, + "num_tokens": 4755635494.0, + "step": 9303 + }, + { + "epoch": 2.5159545700378585, + "grad_norm": 3.3125, + "learning_rate": 0.01134595412767022, + "loss": 3.0659, + "mean_token_accuracy": 0.3957047164440155, + "num_tokens": 4756150813.0, + "step": 9304 + }, + { + "epoch": 2.5162249864791777, + "grad_norm": 3.171875, + "learning_rate": 0.011344378798294072, + "loss": 3.0713, + "mean_token_accuracy": 0.4316893219947815, + "num_tokens": 4756610562.0, + "step": 9305 + }, + { + "epoch": 2.516495402920498, + "grad_norm": 3.6875, + "learning_rate": 0.011342803458351374, + "loss": 3.3479, + "mean_token_accuracy": 0.38690948486328125, + "num_tokens": 4757134811.0, + "step": 9306 + }, + { + "epoch": 2.516765819361817, + "grad_norm": 3.234375, + "learning_rate": 0.011341228107890455, + "loss": 3.1617, + "mean_token_accuracy": 0.40081125497817993, + "num_tokens": 4757659069.0, + "step": 9307 + }, + { + "epoch": 2.517036235803137, + "grad_norm": 3.171875, + "learning_rate": 0.011339652746959654, + "loss": 3.2685, + "mean_token_accuracy": 0.3869799077510834, + "num_tokens": 4758183355.0, + "step": 9308 + }, + { + "epoch": 2.5173066522444563, + "grad_norm": 2.828125, + "learning_rate": 0.01133807737560731, + "loss": 3.0834, + "mean_token_accuracy": 0.3974170684814453, + "num_tokens": 4758707582.0, + "step": 9309 + }, + { + "epoch": 2.5175770686857764, + "grad_norm": 3.171875, + "learning_rate": 0.011336501993881755, + "loss": 3.2035, + "mean_token_accuracy": 0.39954376220703125, + "num_tokens": 4759231814.0, + "step": 9310 + }, + { + "epoch": 2.5178474851270956, + "grad_norm": 103.5, + "learning_rate": 0.011334926601831335, + "loss": 20.1535, + "mean_token_accuracy": 0.0363098569214344, + "num_tokens": 4759756055.0, + "step": 9311 + }, + { + "epoch": 2.518117901568415, + "grad_norm": 6.21875, + "learning_rate": 0.011333351199504378, + "loss": 3.5586, + "mean_token_accuracy": 0.3392118215560913, + "num_tokens": 4760280331.0, + "step": 9312 + }, + { + "epoch": 2.518388318009735, + "grad_norm": 21.5, + "learning_rate": 0.011331775786949225, + "loss": 3.374, + "mean_token_accuracy": 0.35988280177116394, + "num_tokens": 4760767986.0, + "step": 9313 + }, + { + "epoch": 2.5186587344510545, + "grad_norm": 2.4375, + "learning_rate": 0.011330200364214216, + "loss": 3.2026, + "mean_token_accuracy": 0.37674951553344727, + "num_tokens": 4761292268.0, + "step": 9314 + }, + { + "epoch": 2.518929150892374, + "grad_norm": 2.234375, + "learning_rate": 0.011328624931347689, + "loss": 3.1069, + "mean_token_accuracy": 0.40494492650032043, + "num_tokens": 4761816504.0, + "step": 9315 + }, + { + "epoch": 2.5191995673336938, + "grad_norm": 2.8125, + "learning_rate": 0.011327049488397984, + "loss": 3.1408, + "mean_token_accuracy": 0.39943188428878784, + "num_tokens": 4762340678.0, + "step": 9316 + }, + { + "epoch": 2.5194699837750134, + "grad_norm": 2.8125, + "learning_rate": 0.011325474035413438, + "loss": 3.1839, + "mean_token_accuracy": 0.39370018243789673, + "num_tokens": 4762856410.0, + "step": 9317 + }, + { + "epoch": 2.519740400216333, + "grad_norm": 3.296875, + "learning_rate": 0.011323898572442393, + "loss": 3.1892, + "mean_token_accuracy": 0.400515615940094, + "num_tokens": 4763380684.0, + "step": 9318 + }, + { + "epoch": 2.5200108166576527, + "grad_norm": 2.734375, + "learning_rate": 0.011322323099533184, + "loss": 3.0019, + "mean_token_accuracy": 0.41194820404052734, + "num_tokens": 4763871297.0, + "step": 9319 + }, + { + "epoch": 2.5202812330989723, + "grad_norm": 3.109375, + "learning_rate": 0.011320747616734158, + "loss": 2.9758, + "mean_token_accuracy": 0.4405081272125244, + "num_tokens": 4764395445.0, + "step": 9320 + }, + { + "epoch": 2.520551649540292, + "grad_norm": 2.78125, + "learning_rate": 0.011319172124093646, + "loss": 3.26, + "mean_token_accuracy": 0.4053505063056946, + "num_tokens": 4764919617.0, + "step": 9321 + }, + { + "epoch": 2.5208220659816116, + "grad_norm": 2.875, + "learning_rate": 0.01131759662166, + "loss": 3.054, + "mean_token_accuracy": 0.39960283041000366, + "num_tokens": 4765443665.0, + "step": 9322 + }, + { + "epoch": 2.5210924824229313, + "grad_norm": 2.421875, + "learning_rate": 0.011316021109481551, + "loss": 3.2447, + "mean_token_accuracy": 0.40864065289497375, + "num_tokens": 4765967868.0, + "step": 9323 + }, + { + "epoch": 2.521362898864251, + "grad_norm": 2.609375, + "learning_rate": 0.011314445587606652, + "loss": 3.0827, + "mean_token_accuracy": 0.3948234021663666, + "num_tokens": 4766492125.0, + "step": 9324 + }, + { + "epoch": 2.5216333153055706, + "grad_norm": 2.65625, + "learning_rate": 0.01131287005608363, + "loss": 2.9751, + "mean_token_accuracy": 0.43307510018348694, + "num_tokens": 4767016336.0, + "step": 9325 + }, + { + "epoch": 2.52190373174689, + "grad_norm": 2.90625, + "learning_rate": 0.01131129451496084, + "loss": 3.1843, + "mean_token_accuracy": 0.4004720449447632, + "num_tokens": 4767512051.0, + "step": 9326 + }, + { + "epoch": 2.52217414818821, + "grad_norm": 2.484375, + "learning_rate": 0.011309718964286617, + "loss": 3.1731, + "mean_token_accuracy": 0.4137551486492157, + "num_tokens": 4768010349.0, + "step": 9327 + }, + { + "epoch": 2.5224445646295295, + "grad_norm": 3.0, + "learning_rate": 0.011308143404109308, + "loss": 3.174, + "mean_token_accuracy": 0.39597493410110474, + "num_tokens": 4768534498.0, + "step": 9328 + }, + { + "epoch": 2.522714981070849, + "grad_norm": 2.625, + "learning_rate": 0.011306567834477248, + "loss": 3.0694, + "mean_token_accuracy": 0.4106075167655945, + "num_tokens": 4769058715.0, + "step": 9329 + }, + { + "epoch": 2.5229853975121688, + "grad_norm": 3.078125, + "learning_rate": 0.011304992255438788, + "loss": 3.0327, + "mean_token_accuracy": 0.4084312319755554, + "num_tokens": 4769580660.0, + "step": 9330 + }, + { + "epoch": 2.5232558139534884, + "grad_norm": 4.65625, + "learning_rate": 0.011303416667042267, + "loss": 9.3551, + "mean_token_accuracy": 0.006791224703192711, + "num_tokens": 4770104926.0, + "step": 9331 + }, + { + "epoch": 2.523526230394808, + "grad_norm": 5.5, + "learning_rate": 0.011301841069336033, + "loss": 3.5993, + "mean_token_accuracy": 0.3800126016139984, + "num_tokens": 4770568307.0, + "step": 9332 + }, + { + "epoch": 2.5237966468361277, + "grad_norm": 2.1875, + "learning_rate": 0.011300265462368427, + "loss": 3.1091, + "mean_token_accuracy": 0.3976467251777649, + "num_tokens": 4771092590.0, + "step": 9333 + }, + { + "epoch": 2.5240670632774473, + "grad_norm": 2.28125, + "learning_rate": 0.011298689846187792, + "loss": 3.1147, + "mean_token_accuracy": 0.41058045625686646, + "num_tokens": 4771616834.0, + "step": 9334 + }, + { + "epoch": 2.524337479718767, + "grad_norm": 3.4375, + "learning_rate": 0.011297114220842477, + "loss": 3.2666, + "mean_token_accuracy": 0.3961097002029419, + "num_tokens": 4772141028.0, + "step": 9335 + }, + { + "epoch": 2.5246078961600866, + "grad_norm": 3.453125, + "learning_rate": 0.011295538586380818, + "loss": 3.1709, + "mean_token_accuracy": 0.37919801473617554, + "num_tokens": 4772655842.0, + "step": 9336 + }, + { + "epoch": 2.5248783126014063, + "grad_norm": 3.0625, + "learning_rate": 0.011293962942851173, + "loss": 3.094, + "mean_token_accuracy": 0.3876919448375702, + "num_tokens": 4773180038.0, + "step": 9337 + }, + { + "epoch": 2.525148729042726, + "grad_norm": 2.9375, + "learning_rate": 0.011292387290301876, + "loss": 3.2239, + "mean_token_accuracy": 0.38209643959999084, + "num_tokens": 4773704282.0, + "step": 9338 + }, + { + "epoch": 2.5254191454840456, + "grad_norm": 2.84375, + "learning_rate": 0.011290811628781278, + "loss": 3.1212, + "mean_token_accuracy": 0.4103730618953705, + "num_tokens": 4774173218.0, + "step": 9339 + }, + { + "epoch": 2.525689561925365, + "grad_norm": 3.0, + "learning_rate": 0.011289235958337727, + "loss": 3.0695, + "mean_token_accuracy": 0.41672366857528687, + "num_tokens": 4774697476.0, + "step": 9340 + }, + { + "epoch": 2.525959978366685, + "grad_norm": 3.28125, + "learning_rate": 0.011287660279019562, + "loss": 3.0355, + "mean_token_accuracy": 0.38547950983047485, + "num_tokens": 4775221561.0, + "step": 9341 + }, + { + "epoch": 2.5262303948080045, + "grad_norm": 2.921875, + "learning_rate": 0.011286084590875137, + "loss": 2.9613, + "mean_token_accuracy": 0.3927108645439148, + "num_tokens": 4775745812.0, + "step": 9342 + }, + { + "epoch": 2.526500811249324, + "grad_norm": 2.71875, + "learning_rate": 0.011284508893952792, + "loss": 3.0667, + "mean_token_accuracy": 0.3842747211456299, + "num_tokens": 4776270015.0, + "step": 9343 + }, + { + "epoch": 2.5267712276906433, + "grad_norm": 2.796875, + "learning_rate": 0.011282933188300879, + "loss": 3.1392, + "mean_token_accuracy": 0.4028729200363159, + "num_tokens": 4776794294.0, + "step": 9344 + }, + { + "epoch": 2.5270416441319634, + "grad_norm": 2.875, + "learning_rate": 0.011281357473967744, + "loss": 2.9565, + "mean_token_accuracy": 0.3667874336242676, + "num_tokens": 4777318477.0, + "step": 9345 + }, + { + "epoch": 2.5273120605732826, + "grad_norm": 2.125, + "learning_rate": 0.011279781751001734, + "loss": 3.1225, + "mean_token_accuracy": 0.4105612635612488, + "num_tokens": 4777842755.0, + "step": 9346 + }, + { + "epoch": 2.5275824770146027, + "grad_norm": 2.875, + "learning_rate": 0.011278206019451198, + "loss": 3.2667, + "mean_token_accuracy": 0.403381884098053, + "num_tokens": 4778361176.0, + "step": 9347 + }, + { + "epoch": 2.527852893455922, + "grad_norm": 3.15625, + "learning_rate": 0.011276630279364484, + "loss": 3.0245, + "mean_token_accuracy": 0.423324316740036, + "num_tokens": 4778860877.0, + "step": 9348 + }, + { + "epoch": 2.528123309897242, + "grad_norm": 2.59375, + "learning_rate": 0.011275054530789936, + "loss": 3.1003, + "mean_token_accuracy": 0.37941497564315796, + "num_tokens": 4779385145.0, + "step": 9349 + }, + { + "epoch": 2.528393726338561, + "grad_norm": 3.578125, + "learning_rate": 0.01127347877377591, + "loss": 3.2525, + "mean_token_accuracy": 0.39330828189849854, + "num_tokens": 4779909350.0, + "step": 9350 + }, + { + "epoch": 2.5286641427798813, + "grad_norm": 6.8125, + "learning_rate": 0.011271903008370747, + "loss": 14.0009, + "mean_token_accuracy": 0.007693595252931118, + "num_tokens": 4780422112.0, + "step": 9351 + }, + { + "epoch": 2.5289345592212005, + "grad_norm": 8.5625, + "learning_rate": 0.011270327234622802, + "loss": 3.3705, + "mean_token_accuracy": 0.380908727645874, + "num_tokens": 4780895777.0, + "step": 9352 + }, + { + "epoch": 2.52920497566252, + "grad_norm": 2.359375, + "learning_rate": 0.011268751452580423, + "loss": 3.1741, + "mean_token_accuracy": 0.38813936710357666, + "num_tokens": 4781419985.0, + "step": 9353 + }, + { + "epoch": 2.5294753921038398, + "grad_norm": 2.5625, + "learning_rate": 0.01126717566229196, + "loss": 2.889, + "mean_token_accuracy": 0.41396254301071167, + "num_tokens": 4781944181.0, + "step": 9354 + }, + { + "epoch": 2.5297458085451594, + "grad_norm": 2.625, + "learning_rate": 0.01126559986380576, + "loss": 3.1631, + "mean_token_accuracy": 0.39357417821884155, + "num_tokens": 4782468368.0, + "step": 9355 + }, + { + "epoch": 2.530016224986479, + "grad_norm": 2.65625, + "learning_rate": 0.011264024057170174, + "loss": 3.1931, + "mean_token_accuracy": 0.3901693522930145, + "num_tokens": 4782956025.0, + "step": 9356 + }, + { + "epoch": 2.5302866414277987, + "grad_norm": 3.09375, + "learning_rate": 0.011262448242433555, + "loss": 3.3527, + "mean_token_accuracy": 0.3774089217185974, + "num_tokens": 4783480233.0, + "step": 9357 + }, + { + "epoch": 2.5305570578691183, + "grad_norm": 2.90625, + "learning_rate": 0.011260872419644255, + "loss": 3.4348, + "mean_token_accuracy": 0.37151074409484863, + "num_tokens": 4784004514.0, + "step": 9358 + }, + { + "epoch": 2.530827474310438, + "grad_norm": 2.703125, + "learning_rate": 0.011259296588850618, + "loss": 2.9933, + "mean_token_accuracy": 0.39484813809394836, + "num_tokens": 4784528690.0, + "step": 9359 + }, + { + "epoch": 2.5310978907517576, + "grad_norm": 2.390625, + "learning_rate": 0.011257720750101001, + "loss": 3.091, + "mean_token_accuracy": 0.39146167039871216, + "num_tokens": 4785017086.0, + "step": 9360 + }, + { + "epoch": 2.5313683071930773, + "grad_norm": 2.75, + "learning_rate": 0.011256144903443753, + "loss": 3.1738, + "mean_token_accuracy": 0.38520413637161255, + "num_tokens": 4785541233.0, + "step": 9361 + }, + { + "epoch": 2.531638723634397, + "grad_norm": 2.296875, + "learning_rate": 0.011254569048927228, + "loss": 2.7467, + "mean_token_accuracy": 0.4123099446296692, + "num_tokens": 4786065312.0, + "step": 9362 + }, + { + "epoch": 2.5319091400757165, + "grad_norm": 2.328125, + "learning_rate": 0.011252993186599773, + "loss": 3.3378, + "mean_token_accuracy": 0.3844868540763855, + "num_tokens": 4786589520.0, + "step": 9363 + }, + { + "epoch": 2.532179556517036, + "grad_norm": 2.78125, + "learning_rate": 0.011251417316509742, + "loss": 3.1759, + "mean_token_accuracy": 0.3970085382461548, + "num_tokens": 4787113773.0, + "step": 9364 + }, + { + "epoch": 2.532449972958356, + "grad_norm": 2.671875, + "learning_rate": 0.01124984143870549, + "loss": 3.0947, + "mean_token_accuracy": 0.3945687413215637, + "num_tokens": 4787637942.0, + "step": 9365 + }, + { + "epoch": 2.5327203893996755, + "grad_norm": 3.828125, + "learning_rate": 0.011248265553235373, + "loss": 3.1991, + "mean_token_accuracy": 0.3989677131175995, + "num_tokens": 4788162180.0, + "step": 9366 + }, + { + "epoch": 2.532990805840995, + "grad_norm": 4.40625, + "learning_rate": 0.01124668966014773, + "loss": 3.1466, + "mean_token_accuracy": 0.39488881826400757, + "num_tokens": 4788686416.0, + "step": 9367 + }, + { + "epoch": 2.5332612222823148, + "grad_norm": 2.640625, + "learning_rate": 0.011245113759490928, + "loss": 2.9704, + "mean_token_accuracy": 0.40952110290527344, + "num_tokens": 4789210514.0, + "step": 9368 + }, + { + "epoch": 2.5335316387236344, + "grad_norm": 2.859375, + "learning_rate": 0.011243537851313313, + "loss": 3.2054, + "mean_token_accuracy": 0.4008028507232666, + "num_tokens": 4789734771.0, + "step": 9369 + }, + { + "epoch": 2.533802055164954, + "grad_norm": 3.484375, + "learning_rate": 0.011241961935663245, + "loss": 3.2527, + "mean_token_accuracy": 0.38309159874916077, + "num_tokens": 4790258835.0, + "step": 9370 + }, + { + "epoch": 2.5340724716062737, + "grad_norm": 81.5, + "learning_rate": 0.011240386012589065, + "loss": 11.811, + "mean_token_accuracy": 0.0002982183650601655, + "num_tokens": 4790782939.0, + "step": 9371 + }, + { + "epoch": 2.5343428880475933, + "grad_norm": 6.15625, + "learning_rate": 0.01123881008213914, + "loss": 3.3431, + "mean_token_accuracy": 0.36218029260635376, + "num_tokens": 4791307074.0, + "step": 9372 + }, + { + "epoch": 2.534613304488913, + "grad_norm": 2.59375, + "learning_rate": 0.011237234144361822, + "loss": 2.9376, + "mean_token_accuracy": 0.4395526647567749, + "num_tokens": 4791831204.0, + "step": 9373 + }, + { + "epoch": 2.5348837209302326, + "grad_norm": 2.015625, + "learning_rate": 0.011235658199305458, + "loss": 2.9989, + "mean_token_accuracy": 0.401577353477478, + "num_tokens": 4792355322.0, + "step": 9374 + }, + { + "epoch": 2.5351541373715523, + "grad_norm": 2.796875, + "learning_rate": 0.01123408224701841, + "loss": 2.9711, + "mean_token_accuracy": 0.41304346919059753, + "num_tokens": 4792879558.0, + "step": 9375 + }, + { + "epoch": 2.535424553812872, + "grad_norm": 3.046875, + "learning_rate": 0.011232506287549031, + "loss": 3.0501, + "mean_token_accuracy": 0.3872864246368408, + "num_tokens": 4793403783.0, + "step": 9376 + }, + { + "epoch": 2.5356949702541915, + "grad_norm": 3.84375, + "learning_rate": 0.011230930320945673, + "loss": 3.4108, + "mean_token_accuracy": 0.35823488235473633, + "num_tokens": 4793891181.0, + "step": 9377 + }, + { + "epoch": 2.535965386695511, + "grad_norm": 2.921875, + "learning_rate": 0.011229354347256694, + "loss": 3.1366, + "mean_token_accuracy": 0.4096079170703888, + "num_tokens": 4794415320.0, + "step": 9378 + }, + { + "epoch": 2.536235803136831, + "grad_norm": 3.59375, + "learning_rate": 0.01122777836653045, + "loss": 3.1807, + "mean_token_accuracy": 0.3785304129123688, + "num_tokens": 4794939506.0, + "step": 9379 + }, + { + "epoch": 2.5365062195781505, + "grad_norm": 3.421875, + "learning_rate": 0.011226202378815296, + "loss": 3.082, + "mean_token_accuracy": 0.3991342782974243, + "num_tokens": 4795463658.0, + "step": 9380 + }, + { + "epoch": 2.53677663601947, + "grad_norm": 2.6875, + "learning_rate": 0.01122462638415959, + "loss": 3.1175, + "mean_token_accuracy": 0.42393946647644043, + "num_tokens": 4795890417.0, + "step": 9381 + }, + { + "epoch": 2.5370470524607898, + "grad_norm": 2.765625, + "learning_rate": 0.011223050382611682, + "loss": 3.2755, + "mean_token_accuracy": 0.40251678228378296, + "num_tokens": 4796379189.0, + "step": 9382 + }, + { + "epoch": 2.5373174689021094, + "grad_norm": 4.1875, + "learning_rate": 0.011221474374219933, + "loss": 3.1741, + "mean_token_accuracy": 0.4033529460430145, + "num_tokens": 4796867383.0, + "step": 9383 + }, + { + "epoch": 2.537587885343429, + "grad_norm": 2.796875, + "learning_rate": 0.0112198983590327, + "loss": 3.2415, + "mean_token_accuracy": 0.3918412923812866, + "num_tokens": 4797391582.0, + "step": 9384 + }, + { + "epoch": 2.5378583017847482, + "grad_norm": 3.71875, + "learning_rate": 0.011218322337098338, + "loss": 3.1759, + "mean_token_accuracy": 0.40318456292152405, + "num_tokens": 4797915851.0, + "step": 9385 + }, + { + "epoch": 2.5381287182260683, + "grad_norm": 2.515625, + "learning_rate": 0.011216746308465204, + "loss": 3.2456, + "mean_token_accuracy": 0.39696866273880005, + "num_tokens": 4798440088.0, + "step": 9386 + }, + { + "epoch": 2.5383991346673875, + "grad_norm": 3.140625, + "learning_rate": 0.011215170273181656, + "loss": 2.9434, + "mean_token_accuracy": 0.4069731831550598, + "num_tokens": 4798964239.0, + "step": 9387 + }, + { + "epoch": 2.5386695511087076, + "grad_norm": 6.75, + "learning_rate": 0.011213594231296052, + "loss": 2.7871, + "mean_token_accuracy": 0.4596899449825287, + "num_tokens": 4799488494.0, + "step": 9388 + }, + { + "epoch": 2.538939967550027, + "grad_norm": 1.640625, + "learning_rate": 0.011212018182856751, + "loss": 3.0261, + "mean_token_accuracy": 0.4055536985397339, + "num_tokens": 4800012636.0, + "step": 9389 + }, + { + "epoch": 2.539210383991347, + "grad_norm": 13.8125, + "learning_rate": 0.011210442127912106, + "loss": 2.8804, + "mean_token_accuracy": 0.42106762528419495, + "num_tokens": 4800536655.0, + "step": 9390 + }, + { + "epoch": 2.539480800432666, + "grad_norm": 79.0, + "learning_rate": 0.011208866066510477, + "loss": 12.4856, + "mean_token_accuracy": 0.012674554251134396, + "num_tokens": 4801050984.0, + "step": 9391 + }, + { + "epoch": 2.539751216873986, + "grad_norm": 6.46875, + "learning_rate": 0.011207289998700225, + "loss": 3.6269, + "mean_token_accuracy": 0.3115096092224121, + "num_tokens": 4801575108.0, + "step": 9392 + }, + { + "epoch": 2.5400216333153054, + "grad_norm": 2.4375, + "learning_rate": 0.011205713924529704, + "loss": 3.1788, + "mean_token_accuracy": 0.3818354606628418, + "num_tokens": 4802058811.0, + "step": 9393 + }, + { + "epoch": 2.540292049756625, + "grad_norm": 3.796875, + "learning_rate": 0.011204137844047278, + "loss": 2.8815, + "mean_token_accuracy": 0.45921194553375244, + "num_tokens": 4802583001.0, + "step": 9394 + }, + { + "epoch": 2.5405624661979447, + "grad_norm": 3.09375, + "learning_rate": 0.0112025617573013, + "loss": 2.9839, + "mean_token_accuracy": 0.40138739347457886, + "num_tokens": 4803107172.0, + "step": 9395 + }, + { + "epoch": 2.5408328826392643, + "grad_norm": 4.15625, + "learning_rate": 0.011200985664340133, + "loss": 3.0649, + "mean_token_accuracy": 0.3955264389514923, + "num_tokens": 4803631363.0, + "step": 9396 + }, + { + "epoch": 2.541103299080584, + "grad_norm": 2.671875, + "learning_rate": 0.011199409565212134, + "loss": 2.9253, + "mean_token_accuracy": 0.419996440410614, + "num_tokens": 4804096431.0, + "step": 9397 + }, + { + "epoch": 2.5413737155219036, + "grad_norm": 3.171875, + "learning_rate": 0.01119783345996566, + "loss": 3.1384, + "mean_token_accuracy": 0.3939177989959717, + "num_tokens": 4804620494.0, + "step": 9398 + }, + { + "epoch": 2.5416441319632233, + "grad_norm": 2.765625, + "learning_rate": 0.01119625734864908, + "loss": 3.0265, + "mean_token_accuracy": 0.4221653342247009, + "num_tokens": 4805144706.0, + "step": 9399 + }, + { + "epoch": 2.541914548404543, + "grad_norm": 2.96875, + "learning_rate": 0.011194681231310741, + "loss": 3.0998, + "mean_token_accuracy": 0.4046936631202698, + "num_tokens": 4805596317.0, + "step": 9400 + }, + { + "epoch": 2.5421849648458625, + "grad_norm": 2.296875, + "learning_rate": 0.011193105107999016, + "loss": 2.7902, + "mean_token_accuracy": 0.4543359875679016, + "num_tokens": 4806075929.0, + "step": 9401 + }, + { + "epoch": 2.542455381287182, + "grad_norm": 2.5, + "learning_rate": 0.011191528978762253, + "loss": 3.1142, + "mean_token_accuracy": 0.40825900435447693, + "num_tokens": 4806600068.0, + "step": 9402 + }, + { + "epoch": 2.542725797728502, + "grad_norm": 2.9375, + "learning_rate": 0.01118995284364882, + "loss": 3.0154, + "mean_token_accuracy": 0.38817232847213745, + "num_tokens": 4807124137.0, + "step": 9403 + }, + { + "epoch": 2.5429962141698215, + "grad_norm": 2.34375, + "learning_rate": 0.011188376702707075, + "loss": 3.0521, + "mean_token_accuracy": 0.40324920415878296, + "num_tokens": 4807648187.0, + "step": 9404 + }, + { + "epoch": 2.543266630611141, + "grad_norm": 2.703125, + "learning_rate": 0.01118680055598538, + "loss": 3.2098, + "mean_token_accuracy": 0.39526063203811646, + "num_tokens": 4808172360.0, + "step": 9405 + }, + { + "epoch": 2.5435370470524608, + "grad_norm": 2.859375, + "learning_rate": 0.011185224403532092, + "loss": 3.2256, + "mean_token_accuracy": 0.38984614610671997, + "num_tokens": 4808696641.0, + "step": 9406 + }, + { + "epoch": 2.5438074634937804, + "grad_norm": 2.765625, + "learning_rate": 0.011183648245395575, + "loss": 2.9101, + "mean_token_accuracy": 0.4213428795337677, + "num_tokens": 4809220819.0, + "step": 9407 + }, + { + "epoch": 2.5440778799351, + "grad_norm": 2.921875, + "learning_rate": 0.011182072081624188, + "loss": 3.1752, + "mean_token_accuracy": 0.4055500626564026, + "num_tokens": 4809732441.0, + "step": 9408 + }, + { + "epoch": 2.5443482963764197, + "grad_norm": 3.21875, + "learning_rate": 0.0111804959122663, + "loss": 2.9785, + "mean_token_accuracy": 0.4107062816619873, + "num_tokens": 4810223947.0, + "step": 9409 + }, + { + "epoch": 2.5446187128177393, + "grad_norm": 6.34375, + "learning_rate": 0.011178919737370261, + "loss": 3.1011, + "mean_token_accuracy": 0.400752991437912, + "num_tokens": 4810748089.0, + "step": 9410 + }, + { + "epoch": 2.544889129259059, + "grad_norm": 40.5, + "learning_rate": 0.011177343556984442, + "loss": 15.0192, + "mean_token_accuracy": 0.00027805514400824904, + "num_tokens": 4811272301.0, + "step": 9411 + }, + { + "epoch": 2.5451595457003786, + "grad_norm": 3.9375, + "learning_rate": 0.011175767371157202, + "loss": 3.0589, + "mean_token_accuracy": 0.4035273790359497, + "num_tokens": 4811796519.0, + "step": 9412 + }, + { + "epoch": 2.5454299621416983, + "grad_norm": 2.890625, + "learning_rate": 0.011174191179936899, + "loss": 3.3371, + "mean_token_accuracy": 0.39127564430236816, + "num_tokens": 4812320710.0, + "step": 9413 + }, + { + "epoch": 2.545700378583018, + "grad_norm": 3.171875, + "learning_rate": 0.011172614983371895, + "loss": 3.0827, + "mean_token_accuracy": 0.3877533972263336, + "num_tokens": 4812844930.0, + "step": 9414 + }, + { + "epoch": 2.5459707950243375, + "grad_norm": 2.34375, + "learning_rate": 0.01117103878151056, + "loss": 3.2986, + "mean_token_accuracy": 0.40589508414268494, + "num_tokens": 4813335362.0, + "step": 9415 + }, + { + "epoch": 2.546241211465657, + "grad_norm": 2.859375, + "learning_rate": 0.011169462574401254, + "loss": 3.127, + "mean_token_accuracy": 0.4282298684120178, + "num_tokens": 4813760717.0, + "step": 9416 + }, + { + "epoch": 2.546511627906977, + "grad_norm": 2.984375, + "learning_rate": 0.011167886362092337, + "loss": 3.1723, + "mean_token_accuracy": 0.39563220739364624, + "num_tokens": 4814284970.0, + "step": 9417 + }, + { + "epoch": 2.5467820443482965, + "grad_norm": 2.828125, + "learning_rate": 0.01116631014463217, + "loss": 2.9889, + "mean_token_accuracy": 0.411979079246521, + "num_tokens": 4814809076.0, + "step": 9418 + }, + { + "epoch": 2.547052460789616, + "grad_norm": 2.265625, + "learning_rate": 0.011164733922069122, + "loss": 3.1919, + "mean_token_accuracy": 0.38144075870513916, + "num_tokens": 4815333259.0, + "step": 9419 + }, + { + "epoch": 2.5473228772309358, + "grad_norm": 2.78125, + "learning_rate": 0.011163157694451552, + "loss": 3.2467, + "mean_token_accuracy": 0.3817659616470337, + "num_tokens": 4815856316.0, + "step": 9420 + }, + { + "epoch": 2.5475932936722554, + "grad_norm": 2.78125, + "learning_rate": 0.01116158146182782, + "loss": 3.0386, + "mean_token_accuracy": 0.40670812129974365, + "num_tokens": 4816380525.0, + "step": 9421 + }, + { + "epoch": 2.547863710113575, + "grad_norm": 3.421875, + "learning_rate": 0.011160005224246297, + "loss": 3.2193, + "mean_token_accuracy": 0.3916189670562744, + "num_tokens": 4816855341.0, + "step": 9422 + }, + { + "epoch": 2.5481341265548947, + "grad_norm": 2.859375, + "learning_rate": 0.011158428981755342, + "loss": 3.068, + "mean_token_accuracy": 0.41843530535697937, + "num_tokens": 4817379551.0, + "step": 9423 + }, + { + "epoch": 2.5484045429962143, + "grad_norm": 2.875, + "learning_rate": 0.01115685273440332, + "loss": 3.2969, + "mean_token_accuracy": 0.37916237115859985, + "num_tokens": 4817903792.0, + "step": 9424 + }, + { + "epoch": 2.548674959437534, + "grad_norm": 2.640625, + "learning_rate": 0.011155276482238598, + "loss": 3.0697, + "mean_token_accuracy": 0.4055316150188446, + "num_tokens": 4818427973.0, + "step": 9425 + }, + { + "epoch": 2.548945375878853, + "grad_norm": 2.859375, + "learning_rate": 0.011153700225309536, + "loss": 3.1728, + "mean_token_accuracy": 0.38379302620887756, + "num_tokens": 4818907897.0, + "step": 9426 + }, + { + "epoch": 2.5492157923201733, + "grad_norm": 2.796875, + "learning_rate": 0.011152123963664495, + "loss": 3.095, + "mean_token_accuracy": 0.40490204095840454, + "num_tokens": 4819432065.0, + "step": 9427 + }, + { + "epoch": 2.5494862087614925, + "grad_norm": 3.015625, + "learning_rate": 0.011150547697351847, + "loss": 3.046, + "mean_token_accuracy": 0.39757195115089417, + "num_tokens": 4819907965.0, + "step": 9428 + }, + { + "epoch": 2.5497566252028125, + "grad_norm": 3.203125, + "learning_rate": 0.011148971426419952, + "loss": 3.0482, + "mean_token_accuracy": 0.4129876494407654, + "num_tokens": 4820370471.0, + "step": 9429 + }, + { + "epoch": 2.5500270416441317, + "grad_norm": 3.734375, + "learning_rate": 0.011147395150917174, + "loss": 3.1103, + "mean_token_accuracy": 0.3926751911640167, + "num_tokens": 4820881123.0, + "step": 9430 + }, + { + "epoch": 2.550297458085452, + "grad_norm": 23.625, + "learning_rate": 0.011145818870891881, + "loss": 14.6336, + "mean_token_accuracy": 0.00030320617952384055, + "num_tokens": 4821368087.0, + "step": 9431 + }, + { + "epoch": 2.550567874526771, + "grad_norm": 7.96875, + "learning_rate": 0.011144242586392438, + "loss": 3.7228, + "mean_token_accuracy": 0.37460583448410034, + "num_tokens": 4821839488.0, + "step": 9432 + }, + { + "epoch": 2.550838290968091, + "grad_norm": 2.640625, + "learning_rate": 0.011142666297467206, + "loss": 3.1811, + "mean_token_accuracy": 0.396851122379303, + "num_tokens": 4822363653.0, + "step": 9433 + }, + { + "epoch": 2.5511087074094103, + "grad_norm": 2.890625, + "learning_rate": 0.011141090004164552, + "loss": 2.9927, + "mean_token_accuracy": 0.3982788622379303, + "num_tokens": 4822835201.0, + "step": 9434 + }, + { + "epoch": 2.5513791238507304, + "grad_norm": 3.328125, + "learning_rate": 0.011139513706532843, + "loss": 2.8785, + "mean_token_accuracy": 0.40372198820114136, + "num_tokens": 4823359450.0, + "step": 9435 + }, + { + "epoch": 2.5516495402920496, + "grad_norm": 2.546875, + "learning_rate": 0.011137937404620442, + "loss": 3.1931, + "mean_token_accuracy": 0.39536741375923157, + "num_tokens": 4823822914.0, + "step": 9436 + }, + { + "epoch": 2.5519199567333692, + "grad_norm": 3.375, + "learning_rate": 0.011136361098475716, + "loss": 3.1119, + "mean_token_accuracy": 0.4011497497558594, + "num_tokens": 4824340651.0, + "step": 9437 + }, + { + "epoch": 2.552190373174689, + "grad_norm": 2.796875, + "learning_rate": 0.01113478478814703, + "loss": 3.2636, + "mean_token_accuracy": 0.38993215560913086, + "num_tokens": 4824864857.0, + "step": 9438 + }, + { + "epoch": 2.5524607896160085, + "grad_norm": 2.65625, + "learning_rate": 0.011133208473682755, + "loss": 3.1346, + "mean_token_accuracy": 0.4221208691596985, + "num_tokens": 4825342327.0, + "step": 9439 + }, + { + "epoch": 2.552731206057328, + "grad_norm": 2.359375, + "learning_rate": 0.011131632155131245, + "loss": 3.0031, + "mean_token_accuracy": 0.4208652973175049, + "num_tokens": 4825866595.0, + "step": 9440 + }, + { + "epoch": 2.553001622498648, + "grad_norm": 2.59375, + "learning_rate": 0.011130055832540878, + "loss": 3.0875, + "mean_token_accuracy": 0.43261152505874634, + "num_tokens": 4826327294.0, + "step": 9441 + }, + { + "epoch": 2.5532720389399675, + "grad_norm": 3.046875, + "learning_rate": 0.011128479505960014, + "loss": 3.1851, + "mean_token_accuracy": 0.4004773497581482, + "num_tokens": 4826792624.0, + "step": 9442 + }, + { + "epoch": 2.553542455381287, + "grad_norm": 3.171875, + "learning_rate": 0.011126903175437021, + "loss": 3.1854, + "mean_token_accuracy": 0.41244152188301086, + "num_tokens": 4827316841.0, + "step": 9443 + }, + { + "epoch": 2.5538128718226067, + "grad_norm": 3.578125, + "learning_rate": 0.011125326841020265, + "loss": 2.8875, + "mean_token_accuracy": 0.4587828516960144, + "num_tokens": 4827841049.0, + "step": 9444 + }, + { + "epoch": 2.5540832882639264, + "grad_norm": 2.25, + "learning_rate": 0.011123750502758115, + "loss": 2.9312, + "mean_token_accuracy": 0.41169095039367676, + "num_tokens": 4828365184.0, + "step": 9445 + }, + { + "epoch": 2.554353704705246, + "grad_norm": 2.625, + "learning_rate": 0.011122174160698935, + "loss": 3.269, + "mean_token_accuracy": 0.39566659927368164, + "num_tokens": 4828889412.0, + "step": 9446 + }, + { + "epoch": 2.5546241211465657, + "grad_norm": 3.625, + "learning_rate": 0.011120597814891092, + "loss": 3.0245, + "mean_token_accuracy": 0.4303955137729645, + "num_tokens": 4829357695.0, + "step": 9447 + }, + { + "epoch": 2.5548945375878853, + "grad_norm": 2.359375, + "learning_rate": 0.011119021465382954, + "loss": 3.0586, + "mean_token_accuracy": 0.40457695722579956, + "num_tokens": 4829854751.0, + "step": 9448 + }, + { + "epoch": 2.555164954029205, + "grad_norm": 3.265625, + "learning_rate": 0.011117445112222886, + "loss": 3.1008, + "mean_token_accuracy": 0.40129515528678894, + "num_tokens": 4830378966.0, + "step": 9449 + }, + { + "epoch": 2.5554353704705246, + "grad_norm": 3.1875, + "learning_rate": 0.011115868755459255, + "loss": 3.196, + "mean_token_accuracy": 0.4068038761615753, + "num_tokens": 4830850386.0, + "step": 9450 + }, + { + "epoch": 2.5557057869118442, + "grad_norm": 14.9375, + "learning_rate": 0.011114292395140432, + "loss": 12.1321, + "mean_token_accuracy": 6.605485395994037e-06, + "num_tokens": 4831374617.0, + "step": 9451 + }, + { + "epoch": 2.555976203353164, + "grad_norm": 7.5, + "learning_rate": 0.011112716031314784, + "loss": 3.4884, + "mean_token_accuracy": 0.354411244392395, + "num_tokens": 4831855256.0, + "step": 9452 + }, + { + "epoch": 2.5562466197944835, + "grad_norm": 2.109375, + "learning_rate": 0.011111139664030673, + "loss": 3.1775, + "mean_token_accuracy": 0.3775235414505005, + "num_tokens": 4832379495.0, + "step": 9453 + }, + { + "epoch": 2.556517036235803, + "grad_norm": 1.9765625, + "learning_rate": 0.011109563293336476, + "loss": 3.1061, + "mean_token_accuracy": 0.41246190667152405, + "num_tokens": 4832867545.0, + "step": 9454 + }, + { + "epoch": 2.556787452677123, + "grad_norm": 2.328125, + "learning_rate": 0.011107986919280551, + "loss": 3.0574, + "mean_token_accuracy": 0.4182584881782532, + "num_tokens": 4833391767.0, + "step": 9455 + }, + { + "epoch": 2.5570578691184425, + "grad_norm": 2.546875, + "learning_rate": 0.011106410541911266, + "loss": 2.9176, + "mean_token_accuracy": 0.4253835082054138, + "num_tokens": 4833915864.0, + "step": 9456 + }, + { + "epoch": 2.557328285559762, + "grad_norm": 2.40625, + "learning_rate": 0.011104834161276998, + "loss": 3.1964, + "mean_token_accuracy": 0.4011146128177643, + "num_tokens": 4834439946.0, + "step": 9457 + }, + { + "epoch": 2.5575987020010817, + "grad_norm": 3.578125, + "learning_rate": 0.011103257777426108, + "loss": 3.2559, + "mean_token_accuracy": 0.3934074342250824, + "num_tokens": 4834953708.0, + "step": 9458 + }, + { + "epoch": 2.5578691184424014, + "grad_norm": 2.4375, + "learning_rate": 0.011101681390406966, + "loss": 3.2396, + "mean_token_accuracy": 0.38736727833747864, + "num_tokens": 4835477930.0, + "step": 9459 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 5.5625, + "learning_rate": 0.011100105000267941, + "loss": 3.0417, + "mean_token_accuracy": 0.4137689471244812, + "num_tokens": 4836002160.0, + "step": 9460 + }, + { + "epoch": 2.5584099513250407, + "grad_norm": 2.265625, + "learning_rate": 0.011098528607057397, + "loss": 2.8928, + "mean_token_accuracy": 0.4122866988182068, + "num_tokens": 4836526262.0, + "step": 9461 + }, + { + "epoch": 2.5586803677663603, + "grad_norm": 3.5625, + "learning_rate": 0.01109695221082371, + "loss": 3.0742, + "mean_token_accuracy": 0.42273426055908203, + "num_tokens": 4837003964.0, + "step": 9462 + }, + { + "epoch": 2.55895078420768, + "grad_norm": 2.59375, + "learning_rate": 0.01109537581161524, + "loss": 3.1219, + "mean_token_accuracy": 0.40206077694892883, + "num_tokens": 4837476804.0, + "step": 9463 + }, + { + "epoch": 2.5592212006489996, + "grad_norm": 3.546875, + "learning_rate": 0.01109379940948036, + "loss": 3.0385, + "mean_token_accuracy": 0.40957707166671753, + "num_tokens": 4838001066.0, + "step": 9464 + }, + { + "epoch": 2.5594916170903192, + "grad_norm": 2.3125, + "learning_rate": 0.01109222300446744, + "loss": 2.8728, + "mean_token_accuracy": 0.42698290944099426, + "num_tokens": 4838525232.0, + "step": 9465 + }, + { + "epoch": 2.559762033531639, + "grad_norm": 3.0, + "learning_rate": 0.01109064659662485, + "loss": 3.1676, + "mean_token_accuracy": 0.40627366304397583, + "num_tokens": 4839049411.0, + "step": 9466 + }, + { + "epoch": 2.560032449972958, + "grad_norm": 3.75, + "learning_rate": 0.011089070186000952, + "loss": 3.1541, + "mean_token_accuracy": 0.4147503674030304, + "num_tokens": 4839573688.0, + "step": 9467 + }, + { + "epoch": 2.560302866414278, + "grad_norm": 3.34375, + "learning_rate": 0.01108749377264412, + "loss": 3.186, + "mean_token_accuracy": 0.3876967430114746, + "num_tokens": 4840097972.0, + "step": 9468 + }, + { + "epoch": 2.5605732828555974, + "grad_norm": 2.484375, + "learning_rate": 0.011085917356602722, + "loss": 2.9961, + "mean_token_accuracy": 0.4268967807292938, + "num_tokens": 4840621965.0, + "step": 9469 + }, + { + "epoch": 2.5608436992969175, + "grad_norm": 3.140625, + "learning_rate": 0.011084340937925123, + "loss": 3.0892, + "mean_token_accuracy": 0.4070492386817932, + "num_tokens": 4841136167.0, + "step": 9470 + }, + { + "epoch": 2.5611141157382367, + "grad_norm": 36.0, + "learning_rate": 0.0110827645166597, + "loss": 13.8488, + "mean_token_accuracy": 0.004367310553789139, + "num_tokens": 4841660433.0, + "step": 9471 + }, + { + "epoch": 2.5613845321795568, + "grad_norm": 5.09375, + "learning_rate": 0.011081188092854815, + "loss": 3.338, + "mean_token_accuracy": 0.38983553647994995, + "num_tokens": 4842184640.0, + "step": 9472 + }, + { + "epoch": 2.561654948620876, + "grad_norm": 2.03125, + "learning_rate": 0.011079611666558844, + "loss": 3.1697, + "mean_token_accuracy": 0.41091281175613403, + "num_tokens": 4842692966.0, + "step": 9473 + }, + { + "epoch": 2.561925365062196, + "grad_norm": 3.15625, + "learning_rate": 0.01107803523782015, + "loss": 3.1924, + "mean_token_accuracy": 0.3876243531703949, + "num_tokens": 4843217149.0, + "step": 9474 + }, + { + "epoch": 2.5621957815035152, + "grad_norm": 3.3125, + "learning_rate": 0.011076458806687109, + "loss": 3.2115, + "mean_token_accuracy": 0.3943013548851013, + "num_tokens": 4843741387.0, + "step": 9475 + }, + { + "epoch": 2.5624661979448353, + "grad_norm": 2.875, + "learning_rate": 0.011074882373208084, + "loss": 3.1608, + "mean_token_accuracy": 0.38045933842658997, + "num_tokens": 4844265530.0, + "step": 9476 + }, + { + "epoch": 2.5627366143861545, + "grad_norm": 3.234375, + "learning_rate": 0.011073305937431449, + "loss": 3.083, + "mean_token_accuracy": 0.4045414328575134, + "num_tokens": 4844789747.0, + "step": 9477 + }, + { + "epoch": 2.563007030827474, + "grad_norm": 3.65625, + "learning_rate": 0.011071729499405571, + "loss": 2.9424, + "mean_token_accuracy": 0.4024518132209778, + "num_tokens": 4845313911.0, + "step": 9478 + }, + { + "epoch": 2.563277447268794, + "grad_norm": 3.21875, + "learning_rate": 0.01107015305917882, + "loss": 3.1982, + "mean_token_accuracy": 0.3918704390525818, + "num_tokens": 4845832419.0, + "step": 9479 + }, + { + "epoch": 2.5635478637101135, + "grad_norm": 3.203125, + "learning_rate": 0.011068576616799568, + "loss": 3.1711, + "mean_token_accuracy": 0.40068912506103516, + "num_tokens": 4846356615.0, + "step": 9480 + }, + { + "epoch": 2.563818280151433, + "grad_norm": 2.921875, + "learning_rate": 0.011067000172316186, + "loss": 3.008, + "mean_token_accuracy": 0.41697001457214355, + "num_tokens": 4846857999.0, + "step": 9481 + }, + { + "epoch": 2.5640886965927527, + "grad_norm": 2.859375, + "learning_rate": 0.011065423725777039, + "loss": 3.2721, + "mean_token_accuracy": 0.3985365331172943, + "num_tokens": 4847382097.0, + "step": 9482 + }, + { + "epoch": 2.5643591130340724, + "grad_norm": 2.578125, + "learning_rate": 0.0110638472772305, + "loss": 3.0219, + "mean_token_accuracy": 0.4004139304161072, + "num_tokens": 4847906321.0, + "step": 9483 + }, + { + "epoch": 2.564629529475392, + "grad_norm": 2.484375, + "learning_rate": 0.011062270826724941, + "loss": 3.1585, + "mean_token_accuracy": 0.39863407611846924, + "num_tokens": 4848430461.0, + "step": 9484 + }, + { + "epoch": 2.5648999459167117, + "grad_norm": 2.703125, + "learning_rate": 0.01106069437430873, + "loss": 2.9902, + "mean_token_accuracy": 0.458584189414978, + "num_tokens": 4848866490.0, + "step": 9485 + }, + { + "epoch": 2.5651703623580313, + "grad_norm": 2.4375, + "learning_rate": 0.011059117920030233, + "loss": 3.1183, + "mean_token_accuracy": 0.415706604719162, + "num_tokens": 4849390579.0, + "step": 9486 + }, + { + "epoch": 2.565440778799351, + "grad_norm": 3.125, + "learning_rate": 0.011057541463937826, + "loss": 3.026, + "mean_token_accuracy": 0.421955406665802, + "num_tokens": 4849914779.0, + "step": 9487 + }, + { + "epoch": 2.5657111952406706, + "grad_norm": 2.796875, + "learning_rate": 0.01105596500607988, + "loss": 3.0932, + "mean_token_accuracy": 0.4057433307170868, + "num_tokens": 4850438883.0, + "step": 9488 + }, + { + "epoch": 2.5659816116819902, + "grad_norm": 2.71875, + "learning_rate": 0.011054388546504763, + "loss": 3.003, + "mean_token_accuracy": 0.4101995825767517, + "num_tokens": 4850963055.0, + "step": 9489 + }, + { + "epoch": 2.56625202812331, + "grad_norm": 2.90625, + "learning_rate": 0.011052812085260844, + "loss": 3.0933, + "mean_token_accuracy": 0.39043641090393066, + "num_tokens": 4851487241.0, + "step": 9490 + }, + { + "epoch": 2.5665224445646295, + "grad_norm": 4.875, + "learning_rate": 0.011051235622396493, + "loss": 11.0281, + "mean_token_accuracy": 2.4929369828896597e-05, + "num_tokens": 4851994429.0, + "step": 9491 + }, + { + "epoch": 2.566792861005949, + "grad_norm": 6.03125, + "learning_rate": 0.011049659157960084, + "loss": 3.4193, + "mean_token_accuracy": 0.3792874217033386, + "num_tokens": 4852518617.0, + "step": 9492 + }, + { + "epoch": 2.567063277447269, + "grad_norm": 2.140625, + "learning_rate": 0.011048082691999988, + "loss": 3.1167, + "mean_token_accuracy": 0.38438957929611206, + "num_tokens": 4853042843.0, + "step": 9493 + }, + { + "epoch": 2.5673336938885885, + "grad_norm": 2.78125, + "learning_rate": 0.011046506224564571, + "loss": 3.2301, + "mean_token_accuracy": 0.3971208930015564, + "num_tokens": 4853530942.0, + "step": 9494 + }, + { + "epoch": 2.567604110329908, + "grad_norm": 3.40625, + "learning_rate": 0.011044929755702206, + "loss": 2.919, + "mean_token_accuracy": 0.44758230447769165, + "num_tokens": 4854055098.0, + "step": 9495 + }, + { + "epoch": 2.5678745267712277, + "grad_norm": 2.296875, + "learning_rate": 0.011043353285461266, + "loss": 3.0848, + "mean_token_accuracy": 0.4134315252304077, + "num_tokens": 4854521028.0, + "step": 9496 + }, + { + "epoch": 2.5681449432125474, + "grad_norm": 2.953125, + "learning_rate": 0.011041776813890118, + "loss": 3.0188, + "mean_token_accuracy": 0.4131750762462616, + "num_tokens": 4855045156.0, + "step": 9497 + }, + { + "epoch": 2.568415359653867, + "grad_norm": 2.59375, + "learning_rate": 0.01104020034103714, + "loss": 2.8964, + "mean_token_accuracy": 0.4131077229976654, + "num_tokens": 4855569255.0, + "step": 9498 + }, + { + "epoch": 2.5686857760951867, + "grad_norm": 2.59375, + "learning_rate": 0.011038623866950687, + "loss": 3.164, + "mean_token_accuracy": 0.401559442281723, + "num_tokens": 4856062406.0, + "step": 9499 + }, + { + "epoch": 2.5689561925365063, + "grad_norm": 4.0625, + "learning_rate": 0.011037047391679147, + "loss": 3.1709, + "mean_token_accuracy": 0.3976708650588989, + "num_tokens": 4856578977.0, + "step": 9500 + }, + { + "epoch": 2.569226608977826, + "grad_norm": 3.984375, + "learning_rate": 0.011035470915270884, + "loss": 3.3173, + "mean_token_accuracy": 0.3837759494781494, + "num_tokens": 4857103254.0, + "step": 9501 + }, + { + "epoch": 2.5694970254191456, + "grad_norm": 3.25, + "learning_rate": 0.011033894437774267, + "loss": 3.0923, + "mean_token_accuracy": 0.37028348445892334, + "num_tokens": 4857627507.0, + "step": 9502 + }, + { + "epoch": 2.5697674418604652, + "grad_norm": 2.875, + "learning_rate": 0.011032317959237672, + "loss": 3.2555, + "mean_token_accuracy": 0.41060352325439453, + "num_tokens": 4858151727.0, + "step": 9503 + }, + { + "epoch": 2.570037858301785, + "grad_norm": 3.515625, + "learning_rate": 0.011030741479709465, + "loss": 3.24, + "mean_token_accuracy": 0.3984195590019226, + "num_tokens": 4858675884.0, + "step": 9504 + }, + { + "epoch": 2.5703082747431045, + "grad_norm": 2.34375, + "learning_rate": 0.01102916499923802, + "loss": 3.0798, + "mean_token_accuracy": 0.40533262491226196, + "num_tokens": 4859195164.0, + "step": 9505 + }, + { + "epoch": 2.570578691184424, + "grad_norm": 3.15625, + "learning_rate": 0.011027588517871703, + "loss": 3.0691, + "mean_token_accuracy": 0.4110162854194641, + "num_tokens": 4859661769.0, + "step": 9506 + }, + { + "epoch": 2.570849107625744, + "grad_norm": 17.875, + "learning_rate": 0.01102601203565889, + "loss": 3.3863, + "mean_token_accuracy": 0.3736954927444458, + "num_tokens": 4860186046.0, + "step": 9507 + }, + { + "epoch": 2.571119524067063, + "grad_norm": 3.234375, + "learning_rate": 0.011024435552647956, + "loss": 3.3059, + "mean_token_accuracy": 0.37774571776390076, + "num_tokens": 4860710318.0, + "step": 9508 + }, + { + "epoch": 2.571389940508383, + "grad_norm": 1.9453125, + "learning_rate": 0.011022859068887263, + "loss": 2.9398, + "mean_token_accuracy": 0.40016424655914307, + "num_tokens": 4861234551.0, + "step": 9509 + }, + { + "epoch": 2.5716603569497023, + "grad_norm": 2.765625, + "learning_rate": 0.011021282584425189, + "loss": 3.0394, + "mean_token_accuracy": 0.42026758193969727, + "num_tokens": 4861714219.0, + "step": 9510 + }, + { + "epoch": 2.5719307733910224, + "grad_norm": 104.0, + "learning_rate": 0.011019706099310102, + "loss": 18.0539, + "mean_token_accuracy": 0.010822962038218975, + "num_tokens": 4862238370.0, + "step": 9511 + }, + { + "epoch": 2.5722011898323416, + "grad_norm": 5.75, + "learning_rate": 0.011018129613590374, + "loss": 3.4263, + "mean_token_accuracy": 0.35582199692726135, + "num_tokens": 4862762644.0, + "step": 9512 + }, + { + "epoch": 2.5724716062736617, + "grad_norm": 2.0625, + "learning_rate": 0.011016553127314378, + "loss": 3.1602, + "mean_token_accuracy": 0.40206676721572876, + "num_tokens": 4863286831.0, + "step": 9513 + }, + { + "epoch": 2.572742022714981, + "grad_norm": 2.5625, + "learning_rate": 0.01101497664053048, + "loss": 2.9969, + "mean_token_accuracy": 0.4200349450111389, + "num_tokens": 4863795022.0, + "step": 9514 + }, + { + "epoch": 2.573012439156301, + "grad_norm": 3.015625, + "learning_rate": 0.011013400153287056, + "loss": 3.2979, + "mean_token_accuracy": 0.3871118724346161, + "num_tokens": 4864319244.0, + "step": 9515 + }, + { + "epoch": 2.57328285559762, + "grad_norm": 2.90625, + "learning_rate": 0.011011823665632477, + "loss": 3.1232, + "mean_token_accuracy": 0.38376495242118835, + "num_tokens": 4864843349.0, + "step": 9516 + }, + { + "epoch": 2.5735532720389402, + "grad_norm": 3.1875, + "learning_rate": 0.011010247177615111, + "loss": 3.1963, + "mean_token_accuracy": 0.4020845890045166, + "num_tokens": 4865367518.0, + "step": 9517 + }, + { + "epoch": 2.5738236884802594, + "grad_norm": 4.0625, + "learning_rate": 0.011008670689283333, + "loss": 3.3315, + "mean_token_accuracy": 0.38957479596138, + "num_tokens": 4865891792.0, + "step": 9518 + }, + { + "epoch": 2.574094104921579, + "grad_norm": 3.59375, + "learning_rate": 0.011007094200685512, + "loss": 3.2432, + "mean_token_accuracy": 0.39974308013916016, + "num_tokens": 4866416049.0, + "step": 9519 + }, + { + "epoch": 2.5743645213628987, + "grad_norm": 3.4375, + "learning_rate": 0.011005517711870019, + "loss": 2.9334, + "mean_token_accuracy": 0.40674668550491333, + "num_tokens": 4866926605.0, + "step": 9520 + }, + { + "epoch": 2.5746349378042184, + "grad_norm": 2.65625, + "learning_rate": 0.011003941222885227, + "loss": 3.2678, + "mean_token_accuracy": 0.39494234323501587, + "num_tokens": 4867419594.0, + "step": 9521 + }, + { + "epoch": 2.574905354245538, + "grad_norm": 3.421875, + "learning_rate": 0.011002364733779508, + "loss": 3.2628, + "mean_token_accuracy": 0.3834080696105957, + "num_tokens": 4867943766.0, + "step": 9522 + }, + { + "epoch": 2.5751757706868577, + "grad_norm": 2.515625, + "learning_rate": 0.011000788244601233, + "loss": 3.0407, + "mean_token_accuracy": 0.40734177827835083, + "num_tokens": 4868412515.0, + "step": 9523 + }, + { + "epoch": 2.5754461871281773, + "grad_norm": 2.796875, + "learning_rate": 0.010999211755398768, + "loss": 3.0511, + "mean_token_accuracy": 0.4038541316986084, + "num_tokens": 4868891097.0, + "step": 9524 + }, + { + "epoch": 2.575716603569497, + "grad_norm": 2.84375, + "learning_rate": 0.010997635266220494, + "loss": 3.199, + "mean_token_accuracy": 0.40871942043304443, + "num_tokens": 4869315221.0, + "step": 9525 + }, + { + "epoch": 2.5759870200108166, + "grad_norm": 2.5, + "learning_rate": 0.010996058777114775, + "loss": 2.9304, + "mean_token_accuracy": 0.4194483757019043, + "num_tokens": 4869839431.0, + "step": 9526 + }, + { + "epoch": 2.5762574364521362, + "grad_norm": 2.921875, + "learning_rate": 0.010994482288129982, + "loss": 3.1973, + "mean_token_accuracy": 0.38188856840133667, + "num_tokens": 4870363574.0, + "step": 9527 + }, + { + "epoch": 2.576527852893456, + "grad_norm": 3.78125, + "learning_rate": 0.01099290579931449, + "loss": 3.0947, + "mean_token_accuracy": 0.4076162576675415, + "num_tokens": 4870856609.0, + "step": 9528 + }, + { + "epoch": 2.5767982693347755, + "grad_norm": 3.109375, + "learning_rate": 0.01099132931071667, + "loss": 3.1114, + "mean_token_accuracy": 0.39079704880714417, + "num_tokens": 4871380813.0, + "step": 9529 + }, + { + "epoch": 2.577068685776095, + "grad_norm": 3.328125, + "learning_rate": 0.010989752822384891, + "loss": 3.1464, + "mean_token_accuracy": 0.39048439264297485, + "num_tokens": 4871905099.0, + "step": 9530 + }, + { + "epoch": 2.577339102217415, + "grad_norm": 11.0, + "learning_rate": 0.010988176334367527, + "loss": 10.5028, + "mean_token_accuracy": 0.0007725837640464306, + "num_tokens": 4872378265.0, + "step": 9531 + }, + { + "epoch": 2.5776095186587344, + "grad_norm": 6.0, + "learning_rate": 0.010986599846712948, + "loss": 3.3165, + "mean_token_accuracy": 0.36761799454689026, + "num_tokens": 4872890440.0, + "step": 9532 + }, + { + "epoch": 2.577879935100054, + "grad_norm": 1.9375, + "learning_rate": 0.010985023359469522, + "loss": 3.1703, + "mean_token_accuracy": 0.38223257660865784, + "num_tokens": 4873414527.0, + "step": 9533 + }, + { + "epoch": 2.5781503515413737, + "grad_norm": 2.96875, + "learning_rate": 0.010983446872685624, + "loss": 3.2498, + "mean_token_accuracy": 0.40097862482070923, + "num_tokens": 4873903567.0, + "step": 9534 + }, + { + "epoch": 2.5784207679826934, + "grad_norm": 3.359375, + "learning_rate": 0.01098187038640963, + "loss": 2.9367, + "mean_token_accuracy": 0.3900802731513977, + "num_tokens": 4874410031.0, + "step": 9535 + }, + { + "epoch": 2.578691184424013, + "grad_norm": 2.875, + "learning_rate": 0.0109802939006899, + "loss": 3.0816, + "mean_token_accuracy": 0.42248860001564026, + "num_tokens": 4874868878.0, + "step": 9536 + }, + { + "epoch": 2.5789616008653327, + "grad_norm": 3.046875, + "learning_rate": 0.010978717415574813, + "loss": 3.3229, + "mean_token_accuracy": 0.40579408407211304, + "num_tokens": 4875329446.0, + "step": 9537 + }, + { + "epoch": 2.5792320173066523, + "grad_norm": 3.140625, + "learning_rate": 0.010977140931112737, + "loss": 3.1812, + "mean_token_accuracy": 0.40379422903060913, + "num_tokens": 4875853646.0, + "step": 9538 + }, + { + "epoch": 2.579502433747972, + "grad_norm": 2.28125, + "learning_rate": 0.010975564447352046, + "loss": 3.0371, + "mean_token_accuracy": 0.42109739780426025, + "num_tokens": 4876377795.0, + "step": 9539 + }, + { + "epoch": 2.5797728501892916, + "grad_norm": 2.96875, + "learning_rate": 0.010973987964341111, + "loss": 3.1548, + "mean_token_accuracy": 0.4030212163925171, + "num_tokens": 4876901968.0, + "step": 9540 + }, + { + "epoch": 2.5800432666306112, + "grad_norm": 3.421875, + "learning_rate": 0.010972411482128297, + "loss": 3.1115, + "mean_token_accuracy": 0.41833072900772095, + "num_tokens": 4877418450.0, + "step": 9541 + }, + { + "epoch": 2.580313683071931, + "grad_norm": 2.796875, + "learning_rate": 0.010970835000761982, + "loss": 3.0399, + "mean_token_accuracy": 0.40136370062828064, + "num_tokens": 4877942703.0, + "step": 9542 + }, + { + "epoch": 2.5805840995132505, + "grad_norm": 2.734375, + "learning_rate": 0.010969258520290539, + "loss": 3.1017, + "mean_token_accuracy": 0.41461503505706787, + "num_tokens": 4878454226.0, + "step": 9543 + }, + { + "epoch": 2.58085451595457, + "grad_norm": 2.515625, + "learning_rate": 0.01096768204076233, + "loss": 3.0825, + "mean_token_accuracy": 0.41535717248916626, + "num_tokens": 4878921583.0, + "step": 9544 + }, + { + "epoch": 2.58112493239589, + "grad_norm": 2.9375, + "learning_rate": 0.010966105562225735, + "loss": 2.7565, + "mean_token_accuracy": 0.4166457951068878, + "num_tokens": 4879445667.0, + "step": 9545 + }, + { + "epoch": 2.5813953488372094, + "grad_norm": 2.375, + "learning_rate": 0.01096452908472912, + "loss": 3.1596, + "mean_token_accuracy": 0.4004170894622803, + "num_tokens": 4879969885.0, + "step": 9546 + }, + { + "epoch": 2.581665765278529, + "grad_norm": 3.109375, + "learning_rate": 0.010962952608320854, + "loss": 2.951, + "mean_token_accuracy": 0.4052729308605194, + "num_tokens": 4880494171.0, + "step": 9547 + }, + { + "epoch": 2.5819361817198487, + "grad_norm": 3.140625, + "learning_rate": 0.010961376133049315, + "loss": 3.0922, + "mean_token_accuracy": 0.4203298091888428, + "num_tokens": 4881018401.0, + "step": 9548 + }, + { + "epoch": 2.582206598161168, + "grad_norm": 4.40625, + "learning_rate": 0.010959799658962866, + "loss": 3.2092, + "mean_token_accuracy": 0.4106093645095825, + "num_tokens": 4881520831.0, + "step": 9549 + }, + { + "epoch": 2.582477014602488, + "grad_norm": 3.046875, + "learning_rate": 0.01095822318610988, + "loss": 3.122, + "mean_token_accuracy": 0.41421622037887573, + "num_tokens": 4881947985.0, + "step": 9550 + }, + { + "epoch": 2.582747431043807, + "grad_norm": 21.625, + "learning_rate": 0.010956646714538736, + "loss": 9.3596, + "mean_token_accuracy": 0.005483011715114117, + "num_tokens": 4882432228.0, + "step": 9551 + }, + { + "epoch": 2.5830178474851273, + "grad_norm": 7.1875, + "learning_rate": 0.010955070244297798, + "loss": 3.2438, + "mean_token_accuracy": 0.3943372964859009, + "num_tokens": 4882956356.0, + "step": 9552 + }, + { + "epoch": 2.5832882639264465, + "grad_norm": 1.953125, + "learning_rate": 0.010953493775435431, + "loss": 3.2183, + "mean_token_accuracy": 0.3908853232860565, + "num_tokens": 4883480634.0, + "step": 9553 + }, + { + "epoch": 2.5835586803677666, + "grad_norm": 2.390625, + "learning_rate": 0.010951917308000016, + "loss": 2.9728, + "mean_token_accuracy": 0.41521039605140686, + "num_tokens": 4883949284.0, + "step": 9554 + }, + { + "epoch": 2.583829096809086, + "grad_norm": 2.875, + "learning_rate": 0.010950340842039918, + "loss": 3.2027, + "mean_token_accuracy": 0.3995642364025116, + "num_tokens": 4884473479.0, + "step": 9555 + }, + { + "epoch": 2.584099513250406, + "grad_norm": 2.40625, + "learning_rate": 0.010948764377603509, + "loss": 3.0524, + "mean_token_accuracy": 0.4271787703037262, + "num_tokens": 4884961313.0, + "step": 9556 + }, + { + "epoch": 2.584369929691725, + "grad_norm": 2.796875, + "learning_rate": 0.010947187914739158, + "loss": 3.2182, + "mean_token_accuracy": 0.36899644136428833, + "num_tokens": 4885485541.0, + "step": 9557 + }, + { + "epoch": 2.584640346133045, + "grad_norm": 2.34375, + "learning_rate": 0.01094561145349524, + "loss": 3.0979, + "mean_token_accuracy": 0.4059603810310364, + "num_tokens": 4886009771.0, + "step": 9558 + }, + { + "epoch": 2.5849107625743644, + "grad_norm": 3.40625, + "learning_rate": 0.01094403499392012, + "loss": 2.9883, + "mean_token_accuracy": 0.4121590852737427, + "num_tokens": 4886534024.0, + "step": 9559 + }, + { + "epoch": 2.585181179015684, + "grad_norm": 2.796875, + "learning_rate": 0.010942458536062174, + "loss": 3.101, + "mean_token_accuracy": 0.4049287736415863, + "num_tokens": 4887058152.0, + "step": 9560 + }, + { + "epoch": 2.5854515954570036, + "grad_norm": 4.1875, + "learning_rate": 0.010940882079969768, + "loss": 3.0562, + "mean_token_accuracy": 0.39167851209640503, + "num_tokens": 4887582262.0, + "step": 9561 + }, + { + "epoch": 2.5857220118983233, + "grad_norm": 2.75, + "learning_rate": 0.010939305625691273, + "loss": 2.9529, + "mean_token_accuracy": 0.39421355724334717, + "num_tokens": 4888106475.0, + "step": 9562 + }, + { + "epoch": 2.585992428339643, + "grad_norm": 3.4375, + "learning_rate": 0.010937729173275061, + "loss": 2.9454, + "mean_token_accuracy": 0.40835660696029663, + "num_tokens": 4888579200.0, + "step": 9563 + }, + { + "epoch": 2.5862628447809626, + "grad_norm": 2.046875, + "learning_rate": 0.010936152722769496, + "loss": 2.9987, + "mean_token_accuracy": 0.3995562195777893, + "num_tokens": 4889103427.0, + "step": 9564 + }, + { + "epoch": 2.586533261222282, + "grad_norm": 2.953125, + "learning_rate": 0.010934576274222963, + "loss": 3.1412, + "mean_token_accuracy": 0.38125619292259216, + "num_tokens": 4889590461.0, + "step": 9565 + }, + { + "epoch": 2.586803677663602, + "grad_norm": 2.5, + "learning_rate": 0.010932999827683816, + "loss": 3.0508, + "mean_token_accuracy": 0.40769702196121216, + "num_tokens": 4890114734.0, + "step": 9566 + }, + { + "epoch": 2.5870740941049215, + "grad_norm": 4.25, + "learning_rate": 0.010931423383200432, + "loss": 3.0584, + "mean_token_accuracy": 0.4114796817302704, + "num_tokens": 4890629250.0, + "step": 9567 + }, + { + "epoch": 2.587344510546241, + "grad_norm": 3.75, + "learning_rate": 0.010929846940821181, + "loss": 3.111, + "mean_token_accuracy": 0.3826899826526642, + "num_tokens": 4891153530.0, + "step": 9568 + }, + { + "epoch": 2.587614926987561, + "grad_norm": 3.21875, + "learning_rate": 0.010928270500594433, + "loss": 2.9006, + "mean_token_accuracy": 0.42386701703071594, + "num_tokens": 4891667107.0, + "step": 9569 + }, + { + "epoch": 2.5878853434288804, + "grad_norm": 3.0, + "learning_rate": 0.010926694062568553, + "loss": 2.9673, + "mean_token_accuracy": 0.4152470827102661, + "num_tokens": 4892178965.0, + "step": 9570 + }, + { + "epoch": 2.5881557598702, + "grad_norm": 33.0, + "learning_rate": 0.010925117626791919, + "loss": 9.8683, + "mean_token_accuracy": 0.027793439105153084, + "num_tokens": 4892703234.0, + "step": 9571 + }, + { + "epoch": 2.5884261763115197, + "grad_norm": 4.875, + "learning_rate": 0.010923541193312895, + "loss": 3.2458, + "mean_token_accuracy": 0.3733838200569153, + "num_tokens": 4893227438.0, + "step": 9572 + }, + { + "epoch": 2.5886965927528394, + "grad_norm": 1.7734375, + "learning_rate": 0.01092196476217985, + "loss": 2.9601, + "mean_token_accuracy": 0.4091910421848297, + "num_tokens": 4893751623.0, + "step": 9573 + }, + { + "epoch": 2.588967009194159, + "grad_norm": 2.34375, + "learning_rate": 0.01092038833344116, + "loss": 3.125, + "mean_token_accuracy": 0.4054238498210907, + "num_tokens": 4894275818.0, + "step": 9574 + }, + { + "epoch": 2.5892374256354787, + "grad_norm": 2.875, + "learning_rate": 0.01091881190714519, + "loss": 3.1284, + "mean_token_accuracy": 0.41867631673812866, + "num_tokens": 4894747257.0, + "step": 9575 + }, + { + "epoch": 2.5895078420767983, + "grad_norm": 5.4375, + "learning_rate": 0.010917235483340304, + "loss": 2.8087, + "mean_token_accuracy": 0.4663383662700653, + "num_tokens": 4895271544.0, + "step": 9576 + }, + { + "epoch": 2.589778258518118, + "grad_norm": 2.140625, + "learning_rate": 0.01091565906207488, + "loss": 3.1393, + "mean_token_accuracy": 0.4258711636066437, + "num_tokens": 4895747708.0, + "step": 9577 + }, + { + "epoch": 2.5900486749594376, + "grad_norm": 3.234375, + "learning_rate": 0.010914082643397285, + "loss": 3.2404, + "mean_token_accuracy": 0.3994235098361969, + "num_tokens": 4896271909.0, + "step": 9578 + }, + { + "epoch": 2.5903190914007572, + "grad_norm": 2.96875, + "learning_rate": 0.010912506227355883, + "loss": 2.9246, + "mean_token_accuracy": 0.4142444431781769, + "num_tokens": 4896796053.0, + "step": 9579 + }, + { + "epoch": 2.590589507842077, + "grad_norm": 2.734375, + "learning_rate": 0.01091092981399905, + "loss": 3.1288, + "mean_token_accuracy": 0.41038641333580017, + "num_tokens": 4897320290.0, + "step": 9580 + }, + { + "epoch": 2.5908599242833965, + "grad_norm": 3.5625, + "learning_rate": 0.010909353403375155, + "loss": 3.2199, + "mean_token_accuracy": 0.3982090353965759, + "num_tokens": 4897844559.0, + "step": 9581 + }, + { + "epoch": 2.591130340724716, + "grad_norm": 2.296875, + "learning_rate": 0.010907776995532558, + "loss": 2.9883, + "mean_token_accuracy": 0.40858787298202515, + "num_tokens": 4898343540.0, + "step": 9582 + }, + { + "epoch": 2.591400757166036, + "grad_norm": 3.140625, + "learning_rate": 0.010906200590519641, + "loss": 3.3056, + "mean_token_accuracy": 0.3877764642238617, + "num_tokens": 4898851269.0, + "step": 9583 + }, + { + "epoch": 2.5916711736073554, + "grad_norm": 2.921875, + "learning_rate": 0.010904624188384759, + "loss": 3.0626, + "mean_token_accuracy": 0.40989238023757935, + "num_tokens": 4899349653.0, + "step": 9584 + }, + { + "epoch": 2.591941590048675, + "grad_norm": 2.984375, + "learning_rate": 0.010903047789176293, + "loss": 3.2358, + "mean_token_accuracy": 0.40410250425338745, + "num_tokens": 4899873871.0, + "step": 9585 + }, + { + "epoch": 2.5922120064899947, + "grad_norm": 3.53125, + "learning_rate": 0.010901471392942603, + "loss": 2.9909, + "mean_token_accuracy": 0.41509222984313965, + "num_tokens": 4900398113.0, + "step": 9586 + }, + { + "epoch": 2.5924824229313144, + "grad_norm": 2.703125, + "learning_rate": 0.010899894999732058, + "loss": 3.0453, + "mean_token_accuracy": 0.40778931975364685, + "num_tokens": 4900922350.0, + "step": 9587 + }, + { + "epoch": 2.592752839372634, + "grad_norm": 2.765625, + "learning_rate": 0.010898318609593035, + "loss": 2.9356, + "mean_token_accuracy": 0.41518083214759827, + "num_tokens": 4901446441.0, + "step": 9588 + }, + { + "epoch": 2.5930232558139537, + "grad_norm": 4.53125, + "learning_rate": 0.010896742222573894, + "loss": 2.7882, + "mean_token_accuracy": 0.43330585956573486, + "num_tokens": 4901958772.0, + "step": 9589 + }, + { + "epoch": 2.593293672255273, + "grad_norm": 2.21875, + "learning_rate": 0.010895165838723003, + "loss": 3.0537, + "mean_token_accuracy": 0.3924775719642639, + "num_tokens": 4902482833.0, + "step": 9590 + }, + { + "epoch": 2.593564088696593, + "grad_norm": 45.25, + "learning_rate": 0.010893589458088736, + "loss": 11.2153, + "mean_token_accuracy": 0.005068940110504627, + "num_tokens": 4903006990.0, + "step": 9591 + }, + { + "epoch": 2.593834505137912, + "grad_norm": 9.0, + "learning_rate": 0.010892013080719454, + "loss": 3.5108, + "mean_token_accuracy": 0.33163315057754517, + "num_tokens": 4903530989.0, + "step": 9592 + }, + { + "epoch": 2.5941049215792322, + "grad_norm": 2.21875, + "learning_rate": 0.010890436706663526, + "loss": 3.0337, + "mean_token_accuracy": 0.4066067934036255, + "num_tokens": 4904055062.0, + "step": 9593 + }, + { + "epoch": 2.5943753380205514, + "grad_norm": 2.125, + "learning_rate": 0.010888860335969325, + "loss": 3.3014, + "mean_token_accuracy": 0.37635138630867004, + "num_tokens": 4904579289.0, + "step": 9594 + }, + { + "epoch": 2.5946457544618715, + "grad_norm": 2.625, + "learning_rate": 0.01088728396868522, + "loss": 3.2073, + "mean_token_accuracy": 0.39276525378227234, + "num_tokens": 4905103541.0, + "step": 9595 + }, + { + "epoch": 2.5949161709031907, + "grad_norm": 2.3125, + "learning_rate": 0.010885707604859567, + "loss": 3.0024, + "mean_token_accuracy": 0.42716971039772034, + "num_tokens": 4905574123.0, + "step": 9596 + }, + { + "epoch": 2.595186587344511, + "grad_norm": 2.453125, + "learning_rate": 0.010884131244540747, + "loss": 3.0562, + "mean_token_accuracy": 0.3991249203681946, + "num_tokens": 4906098245.0, + "step": 9597 + }, + { + "epoch": 2.59545700378583, + "grad_norm": 3.140625, + "learning_rate": 0.010882554887777118, + "loss": 3.211, + "mean_token_accuracy": 0.39991700649261475, + "num_tokens": 4906622494.0, + "step": 9598 + }, + { + "epoch": 2.59572742022715, + "grad_norm": 3.125, + "learning_rate": 0.010880978534617048, + "loss": 3.2427, + "mean_token_accuracy": 0.3945407271385193, + "num_tokens": 4907146658.0, + "step": 9599 + }, + { + "epoch": 2.5959978366684693, + "grad_norm": 3.265625, + "learning_rate": 0.01087940218510891, + "loss": 3.129, + "mean_token_accuracy": 0.4097113013267517, + "num_tokens": 4907670791.0, + "step": 9600 + }, + { + "epoch": 2.596268253109789, + "grad_norm": 3.390625, + "learning_rate": 0.01087782583930107, + "loss": 3.2082, + "mean_token_accuracy": 0.39972102642059326, + "num_tokens": 4908194857.0, + "step": 9601 + }, + { + "epoch": 2.5965386695511086, + "grad_norm": 7.34375, + "learning_rate": 0.010876249497241886, + "loss": 3.0211, + "mean_token_accuracy": 0.39852988719940186, + "num_tokens": 4908665593.0, + "step": 9602 + }, + { + "epoch": 2.596809085992428, + "grad_norm": 2.15625, + "learning_rate": 0.010874673158979737, + "loss": 3.3408, + "mean_token_accuracy": 0.4104638695716858, + "num_tokens": 4909166228.0, + "step": 9603 + }, + { + "epoch": 2.597079502433748, + "grad_norm": 3.109375, + "learning_rate": 0.010873096824562981, + "loss": 3.07, + "mean_token_accuracy": 0.3888111710548401, + "num_tokens": 4909690456.0, + "step": 9604 + }, + { + "epoch": 2.5973499188750675, + "grad_norm": 2.875, + "learning_rate": 0.010871520494039989, + "loss": 3.0411, + "mean_token_accuracy": 0.4088535010814667, + "num_tokens": 4910214686.0, + "step": 9605 + }, + { + "epoch": 2.597620335316387, + "grad_norm": 3.828125, + "learning_rate": 0.010869944167459124, + "loss": 3.1862, + "mean_token_accuracy": 0.39905911684036255, + "num_tokens": 4910738917.0, + "step": 9606 + }, + { + "epoch": 2.597890751757707, + "grad_norm": 2.90625, + "learning_rate": 0.010868367844868753, + "loss": 2.9813, + "mean_token_accuracy": 0.4036731719970703, + "num_tokens": 4911263183.0, + "step": 9607 + }, + { + "epoch": 2.5981611681990264, + "grad_norm": 2.796875, + "learning_rate": 0.010866791526317247, + "loss": 3.2176, + "mean_token_accuracy": 0.38550105690956116, + "num_tokens": 4911787320.0, + "step": 9608 + }, + { + "epoch": 2.598431584640346, + "grad_norm": 2.875, + "learning_rate": 0.010865215211852972, + "loss": 3.0408, + "mean_token_accuracy": 0.4238525331020355, + "num_tokens": 4912311599.0, + "step": 9609 + }, + { + "epoch": 2.5987020010816657, + "grad_norm": 2.828125, + "learning_rate": 0.010863638901524284, + "loss": 2.9545, + "mean_token_accuracy": 0.40135473012924194, + "num_tokens": 4912835870.0, + "step": 9610 + }, + { + "epoch": 2.5989724175229854, + "grad_norm": 11.5625, + "learning_rate": 0.01086206259537956, + "loss": 10.4588, + "mean_token_accuracy": 0.0002669933019205928, + "num_tokens": 4913360144.0, + "step": 9611 + }, + { + "epoch": 2.599242833964305, + "grad_norm": 6.4375, + "learning_rate": 0.01086048629346716, + "loss": 3.4776, + "mean_token_accuracy": 0.392616868019104, + "num_tokens": 4913830371.0, + "step": 9612 + }, + { + "epoch": 2.5995132504056246, + "grad_norm": 2.953125, + "learning_rate": 0.010858909995835449, + "loss": 3.2364, + "mean_token_accuracy": 0.3697076737880707, + "num_tokens": 4914354623.0, + "step": 9613 + }, + { + "epoch": 2.5997836668469443, + "grad_norm": 3.40625, + "learning_rate": 0.010857333702532796, + "loss": 2.896, + "mean_token_accuracy": 0.4418524205684662, + "num_tokens": 4914878852.0, + "step": 9614 + }, + { + "epoch": 2.600054083288264, + "grad_norm": 2.921875, + "learning_rate": 0.010855757413607567, + "loss": 3.0245, + "mean_token_accuracy": 0.37604013085365295, + "num_tokens": 4915402990.0, + "step": 9615 + }, + { + "epoch": 2.6003244997295836, + "grad_norm": 3.359375, + "learning_rate": 0.010854181129108121, + "loss": 3.1444, + "mean_token_accuracy": 0.40440791845321655, + "num_tokens": 4915927264.0, + "step": 9616 + }, + { + "epoch": 2.600594916170903, + "grad_norm": 3.453125, + "learning_rate": 0.010852604849082828, + "loss": 3.2155, + "mean_token_accuracy": 0.4071803689002991, + "num_tokens": 4916405273.0, + "step": 9617 + }, + { + "epoch": 2.600865332612223, + "grad_norm": 3.0, + "learning_rate": 0.010851028573580054, + "loss": 3.1558, + "mean_token_accuracy": 0.40955501794815063, + "num_tokens": 4916919572.0, + "step": 9618 + }, + { + "epoch": 2.6011357490535425, + "grad_norm": 2.671875, + "learning_rate": 0.010849452302648156, + "loss": 3.2581, + "mean_token_accuracy": 0.39179688692092896, + "num_tokens": 4917443815.0, + "step": 9619 + }, + { + "epoch": 2.601406165494862, + "grad_norm": 2.296875, + "learning_rate": 0.010847876036335507, + "loss": 3.0159, + "mean_token_accuracy": 0.41244345903396606, + "num_tokens": 4917968006.0, + "step": 9620 + }, + { + "epoch": 2.601676581936182, + "grad_norm": 3.421875, + "learning_rate": 0.010846299774690468, + "loss": 3.1989, + "mean_token_accuracy": 0.4135143756866455, + "num_tokens": 4918414206.0, + "step": 9621 + }, + { + "epoch": 2.6019469983775014, + "grad_norm": 3.15625, + "learning_rate": 0.010844723517761404, + "loss": 3.0745, + "mean_token_accuracy": 0.407245934009552, + "num_tokens": 4918895769.0, + "step": 9622 + }, + { + "epoch": 2.602217414818821, + "grad_norm": 2.9375, + "learning_rate": 0.01084314726559668, + "loss": 3.0418, + "mean_token_accuracy": 0.3992210328578949, + "num_tokens": 4919419967.0, + "step": 9623 + }, + { + "epoch": 2.6024878312601407, + "grad_norm": 2.40625, + "learning_rate": 0.01084157101824466, + "loss": 2.9541, + "mean_token_accuracy": 0.44541865587234497, + "num_tokens": 4919944212.0, + "step": 9624 + }, + { + "epoch": 2.6027582477014604, + "grad_norm": 2.75, + "learning_rate": 0.010839994775753705, + "loss": 3.2977, + "mean_token_accuracy": 0.3841380476951599, + "num_tokens": 4920468495.0, + "step": 9625 + }, + { + "epoch": 2.60302866414278, + "grad_norm": 3.09375, + "learning_rate": 0.010838418538172183, + "loss": 3.1561, + "mean_token_accuracy": 0.41765958070755005, + "num_tokens": 4920992769.0, + "step": 9626 + }, + { + "epoch": 2.6032990805840996, + "grad_norm": 2.84375, + "learning_rate": 0.010836842305548454, + "loss": 3.0998, + "mean_token_accuracy": 0.39493322372436523, + "num_tokens": 4921421157.0, + "step": 9627 + }, + { + "epoch": 2.6035694970254193, + "grad_norm": 2.421875, + "learning_rate": 0.01083526607793088, + "loss": 3.0979, + "mean_token_accuracy": 0.4273526668548584, + "num_tokens": 4921904438.0, + "step": 9628 + }, + { + "epoch": 2.603839913466739, + "grad_norm": 3.40625, + "learning_rate": 0.010833689855367833, + "loss": 3.0303, + "mean_token_accuracy": 0.41801711916923523, + "num_tokens": 4922414927.0, + "step": 9629 + }, + { + "epoch": 2.6041103299080586, + "grad_norm": 3.0, + "learning_rate": 0.010832113637907665, + "loss": 3.0595, + "mean_token_accuracy": 0.4006042778491974, + "num_tokens": 4922904080.0, + "step": 9630 + }, + { + "epoch": 2.6043807463493778, + "grad_norm": 23.75, + "learning_rate": 0.010830537425598748, + "loss": 10.4596, + "mean_token_accuracy": 0.011673130095005035, + "num_tokens": 4923428232.0, + "step": 9631 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 7.40625, + "learning_rate": 0.010828961218489442, + "loss": 3.4796, + "mean_token_accuracy": 0.38904982805252075, + "num_tokens": 4923847291.0, + "step": 9632 + }, + { + "epoch": 2.604921579232017, + "grad_norm": 2.09375, + "learning_rate": 0.010827385016628104, + "loss": 3.2646, + "mean_token_accuracy": 0.39030957221984863, + "num_tokens": 4924331802.0, + "step": 9633 + }, + { + "epoch": 2.605191995673337, + "grad_norm": 2.5625, + "learning_rate": 0.010825808820063103, + "loss": 3.1346, + "mean_token_accuracy": 0.3814525008201599, + "num_tokens": 4924855994.0, + "step": 9634 + }, + { + "epoch": 2.6054624121146563, + "grad_norm": 3.390625, + "learning_rate": 0.010824232628842802, + "loss": 3.1789, + "mean_token_accuracy": 0.4020001292228699, + "num_tokens": 4925380265.0, + "step": 9635 + }, + { + "epoch": 2.6057328285559764, + "grad_norm": 2.71875, + "learning_rate": 0.01082265644301556, + "loss": 3.0418, + "mean_token_accuracy": 0.42629915475845337, + "num_tokens": 4925904505.0, + "step": 9636 + }, + { + "epoch": 2.6060032449972956, + "grad_norm": 3.09375, + "learning_rate": 0.01082108026262974, + "loss": 3.0407, + "mean_token_accuracy": 0.4209447503089905, + "num_tokens": 4926428727.0, + "step": 9637 + }, + { + "epoch": 2.6062736614386157, + "grad_norm": 3.546875, + "learning_rate": 0.010819504087733705, + "loss": 2.9422, + "mean_token_accuracy": 0.40354543924331665, + "num_tokens": 4926952950.0, + "step": 9638 + }, + { + "epoch": 2.606544077879935, + "grad_norm": 3.4375, + "learning_rate": 0.010817927918375811, + "loss": 3.1837, + "mean_token_accuracy": 0.39465636014938354, + "num_tokens": 4927477189.0, + "step": 9639 + }, + { + "epoch": 2.606814494321255, + "grad_norm": 2.875, + "learning_rate": 0.010816351754604428, + "loss": 3.1044, + "mean_token_accuracy": 0.4042190909385681, + "num_tokens": 4928001428.0, + "step": 9640 + }, + { + "epoch": 2.607084910762574, + "grad_norm": 2.53125, + "learning_rate": 0.010814775596467912, + "loss": 3.1528, + "mean_token_accuracy": 0.4106277823448181, + "num_tokens": 4928487238.0, + "step": 9641 + }, + { + "epoch": 2.607355327203894, + "grad_norm": 2.890625, + "learning_rate": 0.010813199444014623, + "loss": 3.0748, + "mean_token_accuracy": 0.39689603447914124, + "num_tokens": 4929011494.0, + "step": 9642 + }, + { + "epoch": 2.6076257436452135, + "grad_norm": 2.390625, + "learning_rate": 0.010811623297292927, + "loss": 3.0996, + "mean_token_accuracy": 0.404202401638031, + "num_tokens": 4929535771.0, + "step": 9643 + }, + { + "epoch": 2.607896160086533, + "grad_norm": 3.15625, + "learning_rate": 0.010810047156351183, + "loss": 3.0989, + "mean_token_accuracy": 0.39716172218322754, + "num_tokens": 4930060027.0, + "step": 9644 + }, + { + "epoch": 2.6081665765278528, + "grad_norm": 2.734375, + "learning_rate": 0.010808471021237746, + "loss": 3.0565, + "mean_token_accuracy": 0.40595340728759766, + "num_tokens": 4930584169.0, + "step": 9645 + }, + { + "epoch": 2.6084369929691724, + "grad_norm": 3.8125, + "learning_rate": 0.01080689489200099, + "loss": 3.2462, + "mean_token_accuracy": 0.4007900357246399, + "num_tokens": 4931108360.0, + "step": 9646 + }, + { + "epoch": 2.608707409410492, + "grad_norm": 2.890625, + "learning_rate": 0.010805318768689261, + "loss": 3.1244, + "mean_token_accuracy": 0.41235530376434326, + "num_tokens": 4931632585.0, + "step": 9647 + }, + { + "epoch": 2.6089778258518117, + "grad_norm": 2.328125, + "learning_rate": 0.010803742651350923, + "loss": 2.9936, + "mean_token_accuracy": 0.41258180141448975, + "num_tokens": 4932156798.0, + "step": 9648 + }, + { + "epoch": 2.6092482422931313, + "grad_norm": 2.78125, + "learning_rate": 0.010802166540034339, + "loss": 2.8065, + "mean_token_accuracy": 0.43717992305755615, + "num_tokens": 4932680977.0, + "step": 9649 + }, + { + "epoch": 2.609518658734451, + "grad_norm": 2.96875, + "learning_rate": 0.010800590434787871, + "loss": 3.1335, + "mean_token_accuracy": 0.40445083379745483, + "num_tokens": 4933193081.0, + "step": 9650 + }, + { + "epoch": 2.6097890751757706, + "grad_norm": 131.0, + "learning_rate": 0.010799014335659869, + "loss": 15.3835, + "mean_token_accuracy": 0.00023592091747559607, + "num_tokens": 4933717313.0, + "step": 9651 + }, + { + "epoch": 2.6100594916170903, + "grad_norm": 5.75, + "learning_rate": 0.010797438242698703, + "loss": 3.566, + "mean_token_accuracy": 0.33586427569389343, + "num_tokens": 4934241565.0, + "step": 9652 + }, + { + "epoch": 2.61032990805841, + "grad_norm": 2.5, + "learning_rate": 0.010795862155952723, + "loss": 3.2316, + "mean_token_accuracy": 0.38313812017440796, + "num_tokens": 4934765826.0, + "step": 9653 + }, + { + "epoch": 2.6106003244997296, + "grad_norm": 2.46875, + "learning_rate": 0.010794286075470297, + "loss": 3.1966, + "mean_token_accuracy": 0.4068693220615387, + "num_tokens": 4935290110.0, + "step": 9654 + }, + { + "epoch": 2.610870740941049, + "grad_norm": 2.859375, + "learning_rate": 0.010792710001299777, + "loss": 3.0672, + "mean_token_accuracy": 0.4103319048881531, + "num_tokens": 4935814349.0, + "step": 9655 + }, + { + "epoch": 2.611141157382369, + "grad_norm": 2.703125, + "learning_rate": 0.010791133933489523, + "loss": 3.3051, + "mean_token_accuracy": 0.3859453797340393, + "num_tokens": 4936296223.0, + "step": 9656 + }, + { + "epoch": 2.6114115738236885, + "grad_norm": 3.421875, + "learning_rate": 0.010789557872087897, + "loss": 2.9747, + "mean_token_accuracy": 0.4117445945739746, + "num_tokens": 4936820320.0, + "step": 9657 + }, + { + "epoch": 2.611681990265008, + "grad_norm": 2.703125, + "learning_rate": 0.010787981817143253, + "loss": 2.9826, + "mean_token_accuracy": 0.42195796966552734, + "num_tokens": 4937293237.0, + "step": 9658 + }, + { + "epoch": 2.611952406706328, + "grad_norm": 3.15625, + "learning_rate": 0.010786405768703948, + "loss": 3.1689, + "mean_token_accuracy": 0.41197794675827026, + "num_tokens": 4937735678.0, + "step": 9659 + }, + { + "epoch": 2.6122228231476474, + "grad_norm": 2.34375, + "learning_rate": 0.010784829726818346, + "loss": 2.8629, + "mean_token_accuracy": 0.4228496253490448, + "num_tokens": 4938259670.0, + "step": 9660 + }, + { + "epoch": 2.612493239588967, + "grad_norm": 2.625, + "learning_rate": 0.010783253691534798, + "loss": 3.1386, + "mean_token_accuracy": 0.3995892107486725, + "num_tokens": 4938783839.0, + "step": 9661 + }, + { + "epoch": 2.6127636560302867, + "grad_norm": 3.03125, + "learning_rate": 0.010781677662901664, + "loss": 2.7992, + "mean_token_accuracy": 0.4083700180053711, + "num_tokens": 4939308075.0, + "step": 9662 + }, + { + "epoch": 2.6130340724716064, + "grad_norm": 3.0625, + "learning_rate": 0.010780101640967302, + "loss": 2.884, + "mean_token_accuracy": 0.4211534857749939, + "num_tokens": 4939832239.0, + "step": 9663 + }, + { + "epoch": 2.613304488912926, + "grad_norm": 2.875, + "learning_rate": 0.01077852562578007, + "loss": 2.9884, + "mean_token_accuracy": 0.421470046043396, + "num_tokens": 4940356475.0, + "step": 9664 + }, + { + "epoch": 2.6135749053542456, + "grad_norm": 2.515625, + "learning_rate": 0.01077694961738832, + "loss": 3.0902, + "mean_token_accuracy": 0.4038379192352295, + "num_tokens": 4940880746.0, + "step": 9665 + }, + { + "epoch": 2.6138453217955653, + "grad_norm": 3.390625, + "learning_rate": 0.010775373615840416, + "loss": 3.2579, + "mean_token_accuracy": 0.378680944442749, + "num_tokens": 4941404934.0, + "step": 9666 + }, + { + "epoch": 2.614115738236885, + "grad_norm": 2.40625, + "learning_rate": 0.010773797621184706, + "loss": 3.0422, + "mean_token_accuracy": 0.40694931149482727, + "num_tokens": 4941929028.0, + "step": 9667 + }, + { + "epoch": 2.6143861546782046, + "grad_norm": 2.84375, + "learning_rate": 0.010772221633469551, + "loss": 3.2061, + "mean_token_accuracy": 0.412716269493103, + "num_tokens": 4942397054.0, + "step": 9668 + }, + { + "epoch": 2.614656571119524, + "grad_norm": 2.609375, + "learning_rate": 0.010770645652743308, + "loss": 2.9242, + "mean_token_accuracy": 0.4273233413696289, + "num_tokens": 4942921276.0, + "step": 9669 + }, + { + "epoch": 2.614926987560844, + "grad_norm": 3.3125, + "learning_rate": 0.010769069679054331, + "loss": 3.065, + "mean_token_accuracy": 0.4046638309955597, + "num_tokens": 4943445552.0, + "step": 9670 + }, + { + "epoch": 2.6151974040021635, + "grad_norm": 45.25, + "learning_rate": 0.010767493712450971, + "loss": 10.7421, + "mean_token_accuracy": 1.0446331543789711e-05, + "num_tokens": 4943969691.0, + "step": 9671 + }, + { + "epoch": 2.6154678204434827, + "grad_norm": 6.21875, + "learning_rate": 0.010765917752981594, + "loss": 3.3858, + "mean_token_accuracy": 0.4022524952888489, + "num_tokens": 4944493747.0, + "step": 9672 + }, + { + "epoch": 2.615738236884803, + "grad_norm": 2.765625, + "learning_rate": 0.010764341800694546, + "loss": 3.0168, + "mean_token_accuracy": 0.407509982585907, + "num_tokens": 4944981897.0, + "step": 9673 + }, + { + "epoch": 2.616008653326122, + "grad_norm": 3.09375, + "learning_rate": 0.01076276585563818, + "loss": 3.1796, + "mean_token_accuracy": 0.40779179334640503, + "num_tokens": 4945449646.0, + "step": 9674 + }, + { + "epoch": 2.616279069767442, + "grad_norm": 3.40625, + "learning_rate": 0.010761189917860861, + "loss": 3.2087, + "mean_token_accuracy": 0.3977026343345642, + "num_tokens": 4945968398.0, + "step": 9675 + }, + { + "epoch": 2.6165494862087613, + "grad_norm": 2.953125, + "learning_rate": 0.010759613987410934, + "loss": 3.2297, + "mean_token_accuracy": 0.3930169641971588, + "num_tokens": 4946492497.0, + "step": 9676 + }, + { + "epoch": 2.6168199026500814, + "grad_norm": 3.03125, + "learning_rate": 0.01075803806433676, + "loss": 3.1288, + "mean_token_accuracy": 0.38768163323402405, + "num_tokens": 4947016621.0, + "step": 9677 + }, + { + "epoch": 2.6170903190914006, + "grad_norm": 2.734375, + "learning_rate": 0.010756462148686688, + "loss": 3.0896, + "mean_token_accuracy": 0.38508719205856323, + "num_tokens": 4947540799.0, + "step": 9678 + }, + { + "epoch": 2.6173607355327206, + "grad_norm": 2.71875, + "learning_rate": 0.010754886240509073, + "loss": 3.0081, + "mean_token_accuracy": 0.41000896692276, + "num_tokens": 4948064834.0, + "step": 9679 + }, + { + "epoch": 2.61763115197404, + "grad_norm": 2.546875, + "learning_rate": 0.010753310339852272, + "loss": 2.9546, + "mean_token_accuracy": 0.41944414377212524, + "num_tokens": 4948560909.0, + "step": 9680 + }, + { + "epoch": 2.61790156841536, + "grad_norm": 2.640625, + "learning_rate": 0.010751734446764635, + "loss": 3.0185, + "mean_token_accuracy": 0.40984806418418884, + "num_tokens": 4949085168.0, + "step": 9681 + }, + { + "epoch": 2.618171984856679, + "grad_norm": 2.53125, + "learning_rate": 0.01075015856129451, + "loss": 3.0068, + "mean_token_accuracy": 0.42165887355804443, + "num_tokens": 4949609231.0, + "step": 9682 + }, + { + "epoch": 2.6184424012979988, + "grad_norm": 3.9375, + "learning_rate": 0.01074858268349026, + "loss": 3.175, + "mean_token_accuracy": 0.4069364368915558, + "num_tokens": 4950133357.0, + "step": 9683 + }, + { + "epoch": 2.6187128177393184, + "grad_norm": 3.625, + "learning_rate": 0.01074700681340023, + "loss": 3.1354, + "mean_token_accuracy": 0.40132856369018555, + "num_tokens": 4950657630.0, + "step": 9684 + }, + { + "epoch": 2.618983234180638, + "grad_norm": 2.4375, + "learning_rate": 0.010745430951072775, + "loss": 2.951, + "mean_token_accuracy": 0.42032504081726074, + "num_tokens": 4951181844.0, + "step": 9685 + }, + { + "epoch": 2.6192536506219577, + "grad_norm": 3.125, + "learning_rate": 0.010743855096556247, + "loss": 3.2066, + "mean_token_accuracy": 0.3806151747703552, + "num_tokens": 4951705890.0, + "step": 9686 + }, + { + "epoch": 2.6195240670632773, + "grad_norm": 2.484375, + "learning_rate": 0.010742279249899003, + "loss": 3.0987, + "mean_token_accuracy": 0.40112704038619995, + "num_tokens": 4952230161.0, + "step": 9687 + }, + { + "epoch": 2.619794483504597, + "grad_norm": 3.15625, + "learning_rate": 0.010740703411149381, + "loss": 3.1422, + "mean_token_accuracy": 0.4149491786956787, + "num_tokens": 4952754339.0, + "step": 9688 + }, + { + "epoch": 2.6200648999459166, + "grad_norm": 2.6875, + "learning_rate": 0.010739127580355747, + "loss": 3.2139, + "mean_token_accuracy": 0.3755704462528229, + "num_tokens": 4953278593.0, + "step": 9689 + }, + { + "epoch": 2.6203353163872363, + "grad_norm": 3.484375, + "learning_rate": 0.010737551757566447, + "loss": 3.3021, + "mean_token_accuracy": 0.38509708642959595, + "num_tokens": 4953782378.0, + "step": 9690 + }, + { + "epoch": 2.620605732828556, + "grad_norm": 6.34375, + "learning_rate": 0.010735975942829828, + "loss": 10.702, + "mean_token_accuracy": 0.0, + "num_tokens": 4954306629.0, + "step": 9691 + }, + { + "epoch": 2.6208761492698756, + "grad_norm": 8.125, + "learning_rate": 0.010734400136194241, + "loss": 3.5023, + "mean_token_accuracy": 0.3395019769668579, + "num_tokens": 4954830819.0, + "step": 9692 + }, + { + "epoch": 2.621146565711195, + "grad_norm": 3.4375, + "learning_rate": 0.010732824337708044, + "loss": 2.8677, + "mean_token_accuracy": 0.46098047494888306, + "num_tokens": 4955316160.0, + "step": 9693 + }, + { + "epoch": 2.621416982152515, + "grad_norm": 2.71875, + "learning_rate": 0.010731248547419577, + "loss": 3.2708, + "mean_token_accuracy": 0.3878551721572876, + "num_tokens": 4955840365.0, + "step": 9694 + }, + { + "epoch": 2.6216873985938345, + "grad_norm": 3.5, + "learning_rate": 0.0107296727653772, + "loss": 3.2018, + "mean_token_accuracy": 0.3867090046405792, + "num_tokens": 4956364529.0, + "step": 9695 + }, + { + "epoch": 2.621957815035154, + "grad_norm": 3.078125, + "learning_rate": 0.010728096991629256, + "loss": 3.2519, + "mean_token_accuracy": 0.3982343077659607, + "num_tokens": 4956888728.0, + "step": 9696 + }, + { + "epoch": 2.6222282314764738, + "grad_norm": 3.703125, + "learning_rate": 0.010726521226224093, + "loss": 3.0645, + "mean_token_accuracy": 0.38847798109054565, + "num_tokens": 4957412944.0, + "step": 9697 + }, + { + "epoch": 2.6224986479177934, + "grad_norm": 2.625, + "learning_rate": 0.010724945469210066, + "loss": 3.1648, + "mean_token_accuracy": 0.3950621783733368, + "num_tokens": 4957937127.0, + "step": 9698 + }, + { + "epoch": 2.622769064359113, + "grad_norm": 3.0625, + "learning_rate": 0.010723369720635516, + "loss": 2.8282, + "mean_token_accuracy": 0.4604308605194092, + "num_tokens": 4958461319.0, + "step": 9699 + }, + { + "epoch": 2.6230394808004327, + "grad_norm": 3.34375, + "learning_rate": 0.010721793980548804, + "loss": 2.9968, + "mean_token_accuracy": 0.38355425000190735, + "num_tokens": 4958985519.0, + "step": 9700 + }, + { + "epoch": 2.6233098972417523, + "grad_norm": 2.53125, + "learning_rate": 0.010720218248998268, + "loss": 3.1018, + "mean_token_accuracy": 0.41849011182785034, + "num_tokens": 4959470810.0, + "step": 9701 + }, + { + "epoch": 2.623580313683072, + "grad_norm": 4.0625, + "learning_rate": 0.010718642526032256, + "loss": 3.2618, + "mean_token_accuracy": 0.39015957713127136, + "num_tokens": 4959995071.0, + "step": 9702 + }, + { + "epoch": 2.6238507301243916, + "grad_norm": 2.4375, + "learning_rate": 0.010717066811699124, + "loss": 3.1281, + "mean_token_accuracy": 0.402789831161499, + "num_tokens": 4960519346.0, + "step": 9703 + }, + { + "epoch": 2.6241211465657113, + "grad_norm": 3.234375, + "learning_rate": 0.01071549110604721, + "loss": 3.1561, + "mean_token_accuracy": 0.4115315079689026, + "num_tokens": 4961043621.0, + "step": 9704 + }, + { + "epoch": 2.624391563007031, + "grad_norm": 2.53125, + "learning_rate": 0.010713915409124865, + "loss": 3.1352, + "mean_token_accuracy": 0.4189267158508301, + "num_tokens": 4961567817.0, + "step": 9705 + }, + { + "epoch": 2.6246619794483506, + "grad_norm": 3.171875, + "learning_rate": 0.01071233972098044, + "loss": 3.1625, + "mean_token_accuracy": 0.415533185005188, + "num_tokens": 4961956081.0, + "step": 9706 + }, + { + "epoch": 2.62493239588967, + "grad_norm": 2.921875, + "learning_rate": 0.010710764041662277, + "loss": 2.972, + "mean_token_accuracy": 0.4417160749435425, + "num_tokens": 4962415575.0, + "step": 9707 + }, + { + "epoch": 2.62520281233099, + "grad_norm": 3.03125, + "learning_rate": 0.01070918837121872, + "loss": 2.9895, + "mean_token_accuracy": 0.4134216010570526, + "num_tokens": 4962939807.0, + "step": 9708 + }, + { + "epoch": 2.6254732287723095, + "grad_norm": 2.71875, + "learning_rate": 0.010707612709698125, + "loss": 3.0926, + "mean_token_accuracy": 0.3922916054725647, + "num_tokens": 4963464019.0, + "step": 9709 + }, + { + "epoch": 2.625743645213629, + "grad_norm": 3.125, + "learning_rate": 0.010706037057148832, + "loss": 3.0349, + "mean_token_accuracy": 0.4042954444885254, + "num_tokens": 4963988088.0, + "step": 9710 + }, + { + "epoch": 2.6260140616549488, + "grad_norm": 37.25, + "learning_rate": 0.010704461413619182, + "loss": 17.2771, + "mean_token_accuracy": 0.0368216447532177, + "num_tokens": 4964512353.0, + "step": 9711 + }, + { + "epoch": 2.6262844780962684, + "grad_norm": 6.03125, + "learning_rate": 0.010702885779157525, + "loss": 3.3231, + "mean_token_accuracy": 0.3752431273460388, + "num_tokens": 4965036617.0, + "step": 9712 + }, + { + "epoch": 2.6265548945375876, + "grad_norm": 2.6875, + "learning_rate": 0.010701310153812212, + "loss": 2.9612, + "mean_token_accuracy": 0.4467984735965729, + "num_tokens": 4965560659.0, + "step": 9713 + }, + { + "epoch": 2.6268253109789077, + "grad_norm": 2.4375, + "learning_rate": 0.010699734537631574, + "loss": 3.0115, + "mean_token_accuracy": 0.4068174958229065, + "num_tokens": 4966084807.0, + "step": 9714 + }, + { + "epoch": 2.627095727420227, + "grad_norm": 3.234375, + "learning_rate": 0.010698158930663969, + "loss": 3.022, + "mean_token_accuracy": 0.4109669625759125, + "num_tokens": 4966609086.0, + "step": 9715 + }, + { + "epoch": 2.627366143861547, + "grad_norm": 2.78125, + "learning_rate": 0.010696583332957737, + "loss": 3.0457, + "mean_token_accuracy": 0.39134109020233154, + "num_tokens": 4967133318.0, + "step": 9716 + }, + { + "epoch": 2.627636560302866, + "grad_norm": 3.359375, + "learning_rate": 0.010695007744561214, + "loss": 3.0438, + "mean_token_accuracy": 0.4410633444786072, + "num_tokens": 4967618491.0, + "step": 9717 + }, + { + "epoch": 2.6279069767441863, + "grad_norm": 1.96875, + "learning_rate": 0.010693432165522756, + "loss": 2.9443, + "mean_token_accuracy": 0.4060666561126709, + "num_tokens": 4968103742.0, + "step": 9718 + }, + { + "epoch": 2.6281773931855055, + "grad_norm": 2.3125, + "learning_rate": 0.0106918565958907, + "loss": 2.9075, + "mean_token_accuracy": 0.40994471311569214, + "num_tokens": 4968627968.0, + "step": 9719 + }, + { + "epoch": 2.6284478096268256, + "grad_norm": 2.5625, + "learning_rate": 0.010690281035713383, + "loss": 3.0502, + "mean_token_accuracy": 0.4087819457054138, + "num_tokens": 4969152199.0, + "step": 9720 + }, + { + "epoch": 2.6287182260681448, + "grad_norm": 3.03125, + "learning_rate": 0.010688705485039163, + "loss": 3.0693, + "mean_token_accuracy": 0.41522902250289917, + "num_tokens": 4969624630.0, + "step": 9721 + }, + { + "epoch": 2.628988642509465, + "grad_norm": 2.40625, + "learning_rate": 0.010687129943916369, + "loss": 3.1818, + "mean_token_accuracy": 0.3971215486526489, + "num_tokens": 4970148817.0, + "step": 9722 + }, + { + "epoch": 2.629259058950784, + "grad_norm": 3.015625, + "learning_rate": 0.010685554412393352, + "loss": 3.1262, + "mean_token_accuracy": 0.4096354842185974, + "num_tokens": 4970673096.0, + "step": 9723 + }, + { + "epoch": 2.6295294753921037, + "grad_norm": 2.46875, + "learning_rate": 0.010683978890518451, + "loss": 3.2159, + "mean_token_accuracy": 0.3947794437408447, + "num_tokens": 4971197344.0, + "step": 9724 + }, + { + "epoch": 2.6297998918334233, + "grad_norm": 2.53125, + "learning_rate": 0.010682403378340002, + "loss": 3.0386, + "mean_token_accuracy": 0.4086512327194214, + "num_tokens": 4971694997.0, + "step": 9725 + }, + { + "epoch": 2.630070308274743, + "grad_norm": 2.125, + "learning_rate": 0.010680827875906357, + "loss": 3.0506, + "mean_token_accuracy": 0.4230978488922119, + "num_tokens": 4972219104.0, + "step": 9726 + }, + { + "epoch": 2.6303407247160626, + "grad_norm": 2.765625, + "learning_rate": 0.010679252383265848, + "loss": 3.0575, + "mean_token_accuracy": 0.41462740302085876, + "num_tokens": 4972743220.0, + "step": 9727 + }, + { + "epoch": 2.6306111411573823, + "grad_norm": 2.1875, + "learning_rate": 0.010677676900466818, + "loss": 3.0203, + "mean_token_accuracy": 0.40652352571487427, + "num_tokens": 4973267396.0, + "step": 9728 + }, + { + "epoch": 2.630881557598702, + "grad_norm": 2.484375, + "learning_rate": 0.010676101427557611, + "loss": 2.9578, + "mean_token_accuracy": 0.41306769847869873, + "num_tokens": 4973770077.0, + "step": 9729 + }, + { + "epoch": 2.6311519740400215, + "grad_norm": 2.578125, + "learning_rate": 0.010674525964586566, + "loss": 3.0243, + "mean_token_accuracy": 0.4149683713912964, + "num_tokens": 4974294362.0, + "step": 9730 + }, + { + "epoch": 2.631422390481341, + "grad_norm": 107.5, + "learning_rate": 0.010672950511602018, + "loss": 29.4477, + "mean_token_accuracy": 0.0, + "num_tokens": 4974818551.0, + "step": 9731 + }, + { + "epoch": 2.631692806922661, + "grad_norm": 6.375, + "learning_rate": 0.010671375068652313, + "loss": 3.3693, + "mean_token_accuracy": 0.35927385091781616, + "num_tokens": 4975342718.0, + "step": 9732 + }, + { + "epoch": 2.6319632233639805, + "grad_norm": 2.375, + "learning_rate": 0.010669799635785784, + "loss": 3.1936, + "mean_token_accuracy": 0.4527477025985718, + "num_tokens": 4975742910.0, + "step": 9733 + }, + { + "epoch": 2.6322336398053, + "grad_norm": 2.640625, + "learning_rate": 0.010668224213050774, + "loss": 3.0927, + "mean_token_accuracy": 0.4251493215560913, + "num_tokens": 4976181573.0, + "step": 9734 + }, + { + "epoch": 2.6325040562466198, + "grad_norm": 2.515625, + "learning_rate": 0.010666648800495624, + "loss": 3.0449, + "mean_token_accuracy": 0.407450795173645, + "num_tokens": 4976705828.0, + "step": 9735 + }, + { + "epoch": 2.6327744726879394, + "grad_norm": 3.15625, + "learning_rate": 0.01066507339816867, + "loss": 2.9011, + "mean_token_accuracy": 0.45917069911956787, + "num_tokens": 4977173802.0, + "step": 9736 + }, + { + "epoch": 2.633044889129259, + "grad_norm": 3.484375, + "learning_rate": 0.010663498006118243, + "loss": 3.0304, + "mean_token_accuracy": 0.421327143907547, + "num_tokens": 4977641137.0, + "step": 9737 + }, + { + "epoch": 2.6333153055705787, + "grad_norm": 3.484375, + "learning_rate": 0.010661922624392693, + "loss": 3.1157, + "mean_token_accuracy": 0.40426722168922424, + "num_tokens": 4978157252.0, + "step": 9738 + }, + { + "epoch": 2.6335857220118983, + "grad_norm": 3.515625, + "learning_rate": 0.010660347253040348, + "loss": 3.1327, + "mean_token_accuracy": 0.412824809551239, + "num_tokens": 4978681517.0, + "step": 9739 + }, + { + "epoch": 2.633856138453218, + "grad_norm": 3.21875, + "learning_rate": 0.010658771892109547, + "loss": 3.1689, + "mean_token_accuracy": 0.40057292580604553, + "num_tokens": 4979182096.0, + "step": 9740 + }, + { + "epoch": 2.6341265548945376, + "grad_norm": 3.453125, + "learning_rate": 0.010657196541648628, + "loss": 3.4064, + "mean_token_accuracy": 0.39802253246307373, + "num_tokens": 4979658193.0, + "step": 9741 + }, + { + "epoch": 2.6343969713358573, + "grad_norm": 4.34375, + "learning_rate": 0.010655621201705932, + "loss": 3.2745, + "mean_token_accuracy": 0.4086083769798279, + "num_tokens": 4980118842.0, + "step": 9742 + }, + { + "epoch": 2.634667387777177, + "grad_norm": 2.828125, + "learning_rate": 0.010654045872329782, + "loss": 3.1006, + "mean_token_accuracy": 0.42177385091781616, + "num_tokens": 4980582907.0, + "step": 9743 + }, + { + "epoch": 2.6349378042184965, + "grad_norm": 3.046875, + "learning_rate": 0.01065247055356853, + "loss": 3.2501, + "mean_token_accuracy": 0.3989875912666321, + "num_tokens": 4981107099.0, + "step": 9744 + }, + { + "epoch": 2.635208220659816, + "grad_norm": 2.828125, + "learning_rate": 0.010650895245470497, + "loss": 3.3215, + "mean_token_accuracy": 0.3933674693107605, + "num_tokens": 4981631247.0, + "step": 9745 + }, + { + "epoch": 2.635478637101136, + "grad_norm": 2.65625, + "learning_rate": 0.01064931994808403, + "loss": 2.9413, + "mean_token_accuracy": 0.4080612063407898, + "num_tokens": 4982155336.0, + "step": 9746 + }, + { + "epoch": 2.6357490535424555, + "grad_norm": 2.71875, + "learning_rate": 0.010647744661457457, + "loss": 3.0632, + "mean_token_accuracy": 0.39636629819869995, + "num_tokens": 4982679608.0, + "step": 9747 + }, + { + "epoch": 2.636019469983775, + "grad_norm": 2.828125, + "learning_rate": 0.010646169385639111, + "loss": 3.2364, + "mean_token_accuracy": 0.3786514103412628, + "num_tokens": 4983203881.0, + "step": 9748 + }, + { + "epoch": 2.6362898864250948, + "grad_norm": 3.046875, + "learning_rate": 0.010644594120677331, + "loss": 3.1279, + "mean_token_accuracy": 0.4329487979412079, + "num_tokens": 4983663377.0, + "step": 9749 + }, + { + "epoch": 2.6365603028664144, + "grad_norm": 3.578125, + "learning_rate": 0.010643018866620451, + "loss": 2.9378, + "mean_token_accuracy": 0.4026094675064087, + "num_tokens": 4984187564.0, + "step": 9750 + }, + { + "epoch": 2.636830719307734, + "grad_norm": 41.5, + "learning_rate": 0.010641443623516796, + "loss": 10.4955, + "mean_token_accuracy": 0.009516868740320206, + "num_tokens": 4984710706.0, + "step": 9751 + }, + { + "epoch": 2.6371011357490537, + "grad_norm": 6.75, + "learning_rate": 0.010639868391414708, + "loss": 3.4243, + "mean_token_accuracy": 0.3662131428718567, + "num_tokens": 4985234960.0, + "step": 9752 + }, + { + "epoch": 2.6373715521903733, + "grad_norm": 1.9375, + "learning_rate": 0.01063829317036252, + "loss": 3.2997, + "mean_token_accuracy": 0.38730576634407043, + "num_tokens": 4985759239.0, + "step": 9753 + }, + { + "epoch": 2.6376419686316925, + "grad_norm": 2.84375, + "learning_rate": 0.010636717960408555, + "loss": 3.1558, + "mean_token_accuracy": 0.38471293449401855, + "num_tokens": 4986283491.0, + "step": 9754 + }, + { + "epoch": 2.6379123850730126, + "grad_norm": 3.515625, + "learning_rate": 0.010635142761601154, + "loss": 3.2419, + "mean_token_accuracy": 0.39688801765441895, + "num_tokens": 4986807677.0, + "step": 9755 + }, + { + "epoch": 2.638182801514332, + "grad_norm": 3.75, + "learning_rate": 0.010633567573988647, + "loss": 3.1772, + "mean_token_accuracy": 0.4154212176799774, + "num_tokens": 4987312408.0, + "step": 9756 + }, + { + "epoch": 2.638453217955652, + "grad_norm": 2.546875, + "learning_rate": 0.010631992397619363, + "loss": 2.9514, + "mean_token_accuracy": 0.4286271333694458, + "num_tokens": 4987773493.0, + "step": 9757 + }, + { + "epoch": 2.638723634396971, + "grad_norm": 2.453125, + "learning_rate": 0.010630417232541638, + "loss": 3.1318, + "mean_token_accuracy": 0.3839317560195923, + "num_tokens": 4988297710.0, + "step": 9758 + }, + { + "epoch": 2.638994050838291, + "grad_norm": 2.6875, + "learning_rate": 0.010628842078803798, + "loss": 3.1663, + "mean_token_accuracy": 0.40217360854148865, + "num_tokens": 4988821771.0, + "step": 9759 + }, + { + "epoch": 2.6392644672796104, + "grad_norm": 2.734375, + "learning_rate": 0.01062726693645417, + "loss": 3.17, + "mean_token_accuracy": 0.4118782579898834, + "num_tokens": 4989346031.0, + "step": 9760 + }, + { + "epoch": 2.6395348837209305, + "grad_norm": 3.484375, + "learning_rate": 0.010625691805541094, + "loss": 3.0351, + "mean_token_accuracy": 0.396018385887146, + "num_tokens": 4989870243.0, + "step": 9761 + }, + { + "epoch": 2.6398053001622497, + "grad_norm": 2.640625, + "learning_rate": 0.010624116686112892, + "loss": 3.1047, + "mean_token_accuracy": 0.4062551259994507, + "num_tokens": 4990375087.0, + "step": 9762 + }, + { + "epoch": 2.6400757166035698, + "grad_norm": 3.734375, + "learning_rate": 0.010622541578217893, + "loss": 2.9288, + "mean_token_accuracy": 0.4010789394378662, + "num_tokens": 4990884528.0, + "step": 9763 + }, + { + "epoch": 2.640346133044889, + "grad_norm": 3.28125, + "learning_rate": 0.010620966481904432, + "loss": 2.9825, + "mean_token_accuracy": 0.40065595507621765, + "num_tokens": 4991408786.0, + "step": 9764 + }, + { + "epoch": 2.6406165494862086, + "grad_norm": 2.890625, + "learning_rate": 0.010619391397220835, + "loss": 3.0753, + "mean_token_accuracy": 0.416067510843277, + "num_tokens": 4991911778.0, + "step": 9765 + }, + { + "epoch": 2.6408869659275283, + "grad_norm": 3.25, + "learning_rate": 0.010617816324215424, + "loss": 2.9856, + "mean_token_accuracy": 0.41986432671546936, + "num_tokens": 4992435892.0, + "step": 9766 + }, + { + "epoch": 2.641157382368848, + "grad_norm": 3.203125, + "learning_rate": 0.010616241262936537, + "loss": 3.0058, + "mean_token_accuracy": 0.398876428604126, + "num_tokens": 4992960023.0, + "step": 9767 + }, + { + "epoch": 2.6414277988101675, + "grad_norm": 2.921875, + "learning_rate": 0.010614666213432494, + "loss": 2.9031, + "mean_token_accuracy": 0.4090667963027954, + "num_tokens": 4993484096.0, + "step": 9768 + }, + { + "epoch": 2.641698215251487, + "grad_norm": 4.5625, + "learning_rate": 0.010613091175751623, + "loss": 3.1619, + "mean_token_accuracy": 0.38927945494651794, + "num_tokens": 4994008296.0, + "step": 9769 + }, + { + "epoch": 2.641968631692807, + "grad_norm": 1.9453125, + "learning_rate": 0.010611516149942258, + "loss": 3.0385, + "mean_token_accuracy": 0.42050814628601074, + "num_tokens": 4994524431.0, + "step": 9770 + }, + { + "epoch": 2.6422390481341265, + "grad_norm": 394.0, + "learning_rate": 0.010609941136052714, + "loss": 26.8586, + "mean_token_accuracy": 0.012732982635498047, + "num_tokens": 4995047543.0, + "step": 9771 + }, + { + "epoch": 2.642509464575446, + "grad_norm": 8.0, + "learning_rate": 0.010608366134131329, + "loss": 3.5465, + "mean_token_accuracy": 0.3410904109477997, + "num_tokens": 4995571730.0, + "step": 9772 + }, + { + "epoch": 2.6427798810167658, + "grad_norm": 2.421875, + "learning_rate": 0.01060679114422642, + "loss": 3.1853, + "mean_token_accuracy": 0.39358413219451904, + "num_tokens": 4996057409.0, + "step": 9773 + }, + { + "epoch": 2.6430502974580854, + "grad_norm": 2.375, + "learning_rate": 0.010605216166386313, + "loss": 3.3408, + "mean_token_accuracy": 0.38972610235214233, + "num_tokens": 4996534552.0, + "step": 9774 + }, + { + "epoch": 2.643320713899405, + "grad_norm": 3.140625, + "learning_rate": 0.010603641200659337, + "loss": 2.9817, + "mean_token_accuracy": 0.3776661157608032, + "num_tokens": 4997058759.0, + "step": 9775 + }, + { + "epoch": 2.6435911303407247, + "grad_norm": 3.671875, + "learning_rate": 0.010602066247093816, + "loss": 3.1141, + "mean_token_accuracy": 0.4176192581653595, + "num_tokens": 4997582819.0, + "step": 9776 + }, + { + "epoch": 2.6438615467820443, + "grad_norm": 3.84375, + "learning_rate": 0.010600491305738067, + "loss": 3.2426, + "mean_token_accuracy": 0.4060296416282654, + "num_tokens": 4998106944.0, + "step": 9777 + }, + { + "epoch": 2.644131963223364, + "grad_norm": 3.40625, + "learning_rate": 0.010598916376640425, + "loss": 2.9811, + "mean_token_accuracy": 0.40863853693008423, + "num_tokens": 4998631210.0, + "step": 9778 + }, + { + "epoch": 2.6444023796646836, + "grad_norm": 2.71875, + "learning_rate": 0.01059734145984921, + "loss": 2.9602, + "mean_token_accuracy": 0.4213070273399353, + "num_tokens": 4999155482.0, + "step": 9779 + }, + { + "epoch": 2.6446727961060033, + "grad_norm": 2.828125, + "learning_rate": 0.010595766555412738, + "loss": 2.9126, + "mean_token_accuracy": 0.41724902391433716, + "num_tokens": 4999679763.0, + "step": 9780 + }, + { + "epoch": 2.644943212547323, + "grad_norm": 2.5625, + "learning_rate": 0.01059419166337934, + "loss": 3.083, + "mean_token_accuracy": 0.39360928535461426, + "num_tokens": 5000185744.0, + "step": 9781 + }, + { + "epoch": 2.6452136289886425, + "grad_norm": 2.875, + "learning_rate": 0.010592616783797335, + "loss": 3.2498, + "mean_token_accuracy": 0.39992350339889526, + "num_tokens": 5000658045.0, + "step": 9782 + }, + { + "epoch": 2.645484045429962, + "grad_norm": 3.0, + "learning_rate": 0.010591041916715043, + "loss": 3.1258, + "mean_token_accuracy": 0.3882054090499878, + "num_tokens": 5001182303.0, + "step": 9783 + }, + { + "epoch": 2.645754461871282, + "grad_norm": 2.796875, + "learning_rate": 0.010589467062180788, + "loss": 3.1893, + "mean_token_accuracy": 0.4017910659313202, + "num_tokens": 5001706530.0, + "step": 9784 + }, + { + "epoch": 2.6460248783126015, + "grad_norm": 2.734375, + "learning_rate": 0.010587892220242893, + "loss": 3.1402, + "mean_token_accuracy": 0.38233986496925354, + "num_tokens": 5002198367.0, + "step": 9785 + }, + { + "epoch": 2.646295294753921, + "grad_norm": 2.546875, + "learning_rate": 0.010586317390949673, + "loss": 2.9491, + "mean_token_accuracy": 0.40402787923812866, + "num_tokens": 5002722418.0, + "step": 9786 + }, + { + "epoch": 2.6465657111952408, + "grad_norm": 2.359375, + "learning_rate": 0.010584742574349458, + "loss": 3.1687, + "mean_token_accuracy": 0.40346893668174744, + "num_tokens": 5003246579.0, + "step": 9787 + }, + { + "epoch": 2.6468361276365604, + "grad_norm": 2.734375, + "learning_rate": 0.010583167770490558, + "loss": 3.1003, + "mean_token_accuracy": 0.4210526943206787, + "num_tokens": 5003732142.0, + "step": 9788 + }, + { + "epoch": 2.64710654407788, + "grad_norm": 2.203125, + "learning_rate": 0.010581592979421295, + "loss": 3.0343, + "mean_token_accuracy": 0.4234253764152527, + "num_tokens": 5004206430.0, + "step": 9789 + }, + { + "epoch": 2.6473769605191997, + "grad_norm": 2.59375, + "learning_rate": 0.010580018201189992, + "loss": 3.0316, + "mean_token_accuracy": 0.41361355781555176, + "num_tokens": 5004730696.0, + "step": 9790 + }, + { + "epoch": 2.6476473769605193, + "grad_norm": 2.0625, + "learning_rate": 0.010578443435844964, + "loss": 11.0688, + "mean_token_accuracy": 1.532965325168334e-05, + "num_tokens": 5005254976.0, + "step": 9791 + }, + { + "epoch": 2.647917793401839, + "grad_norm": 5.125, + "learning_rate": 0.010576868683434533, + "loss": 3.2695, + "mean_token_accuracy": 0.3817821741104126, + "num_tokens": 5005779192.0, + "step": 9792 + }, + { + "epoch": 2.6481882098431586, + "grad_norm": 2.15625, + "learning_rate": 0.010575293944007016, + "loss": 2.8534, + "mean_token_accuracy": 0.4362971782684326, + "num_tokens": 5006241688.0, + "step": 9793 + }, + { + "epoch": 2.6484586262844783, + "grad_norm": 2.921875, + "learning_rate": 0.010573719217610727, + "loss": 3.1955, + "mean_token_accuracy": 0.3994903266429901, + "num_tokens": 5006765804.0, + "step": 9794 + }, + { + "epoch": 2.6487290427257975, + "grad_norm": 4.3125, + "learning_rate": 0.010572144504293987, + "loss": 3.3321, + "mean_token_accuracy": 0.40059715509414673, + "num_tokens": 5007290040.0, + "step": 9795 + }, + { + "epoch": 2.6489994591671175, + "grad_norm": 3.53125, + "learning_rate": 0.010570569804105114, + "loss": 3.314, + "mean_token_accuracy": 0.38112872838974, + "num_tokens": 5007749923.0, + "step": 9796 + }, + { + "epoch": 2.6492698756084367, + "grad_norm": 2.96875, + "learning_rate": 0.010568995117092419, + "loss": 3.1405, + "mean_token_accuracy": 0.40143078565597534, + "num_tokens": 5008274111.0, + "step": 9797 + }, + { + "epoch": 2.649540292049757, + "grad_norm": 3.3125, + "learning_rate": 0.010567420443304222, + "loss": 2.9844, + "mean_token_accuracy": 0.431018590927124, + "num_tokens": 5008787096.0, + "step": 9798 + }, + { + "epoch": 2.649810708491076, + "grad_norm": 2.90625, + "learning_rate": 0.010565845782788841, + "loss": 3.3297, + "mean_token_accuracy": 0.38424623012542725, + "num_tokens": 5009311318.0, + "step": 9799 + }, + { + "epoch": 2.650081124932396, + "grad_norm": 3.671875, + "learning_rate": 0.010564271135594583, + "loss": 3.1317, + "mean_token_accuracy": 0.39270853996276855, + "num_tokens": 5009835562.0, + "step": 9800 + }, + { + "epoch": 2.6503515413737153, + "grad_norm": 2.84375, + "learning_rate": 0.010562696501769773, + "loss": 3.0288, + "mean_token_accuracy": 0.42195960879325867, + "num_tokens": 5010325160.0, + "step": 9801 + }, + { + "epoch": 2.6506219578150354, + "grad_norm": 2.9375, + "learning_rate": 0.01056112188136272, + "loss": 2.8139, + "mean_token_accuracy": 0.4135590195655823, + "num_tokens": 5010813642.0, + "step": 9802 + }, + { + "epoch": 2.6508923742563546, + "grad_norm": 2.328125, + "learning_rate": 0.010559547274421734, + "loss": 3.029, + "mean_token_accuracy": 0.4227970242500305, + "num_tokens": 5011314377.0, + "step": 9803 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 3.078125, + "learning_rate": 0.010557972680995136, + "loss": 2.9802, + "mean_token_accuracy": 0.39914828538894653, + "num_tokens": 5011838587.0, + "step": 9804 + }, + { + "epoch": 2.651433207138994, + "grad_norm": 2.9375, + "learning_rate": 0.010556398101131237, + "loss": 2.8084, + "mean_token_accuracy": 0.4354613721370697, + "num_tokens": 5012309945.0, + "step": 9805 + }, + { + "epoch": 2.6517036235803135, + "grad_norm": 3.5, + "learning_rate": 0.010554823534878346, + "loss": 2.8825, + "mean_token_accuracy": 0.409311980009079, + "num_tokens": 5012833968.0, + "step": 9806 + }, + { + "epoch": 2.651974040021633, + "grad_norm": 3.296875, + "learning_rate": 0.010553248982284782, + "loss": 3.0054, + "mean_token_accuracy": 0.41312628984451294, + "num_tokens": 5013358110.0, + "step": 9807 + }, + { + "epoch": 2.652244456462953, + "grad_norm": 4.15625, + "learning_rate": 0.010551674443398851, + "loss": 3.2304, + "mean_token_accuracy": 0.3936893939971924, + "num_tokens": 5013882389.0, + "step": 9808 + }, + { + "epoch": 2.6525148729042725, + "grad_norm": 3.140625, + "learning_rate": 0.010550099918268862, + "loss": 3.259, + "mean_token_accuracy": 0.3900620639324188, + "num_tokens": 5014351580.0, + "step": 9809 + }, + { + "epoch": 2.652785289345592, + "grad_norm": 3.015625, + "learning_rate": 0.010548525406943138, + "loss": 3.118, + "mean_token_accuracy": 0.3871946334838867, + "num_tokens": 5014875775.0, + "step": 9810 + }, + { + "epoch": 2.6530557057869117, + "grad_norm": 10.8125, + "learning_rate": 0.01054695090946998, + "loss": 14.3391, + "mean_token_accuracy": 0.010655292309820652, + "num_tokens": 5015400034.0, + "step": 9811 + }, + { + "epoch": 2.6533261222282314, + "grad_norm": 5.0625, + "learning_rate": 0.010545376425897694, + "loss": 3.3844, + "mean_token_accuracy": 0.366953581571579, + "num_tokens": 5015924261.0, + "step": 9812 + }, + { + "epoch": 2.653596538669551, + "grad_norm": 1.9140625, + "learning_rate": 0.010543801956274605, + "loss": 2.9387, + "mean_token_accuracy": 0.41942480206489563, + "num_tokens": 5016407666.0, + "step": 9813 + }, + { + "epoch": 2.6538669551108707, + "grad_norm": 2.65625, + "learning_rate": 0.010542227500649008, + "loss": 2.9815, + "mean_token_accuracy": 0.39915385842323303, + "num_tokens": 5016931938.0, + "step": 9814 + }, + { + "epoch": 2.6541373715521903, + "grad_norm": 2.6875, + "learning_rate": 0.01054065305906922, + "loss": 3.2363, + "mean_token_accuracy": 0.4074733257293701, + "num_tokens": 5017425266.0, + "step": 9815 + }, + { + "epoch": 2.65440778799351, + "grad_norm": 2.9375, + "learning_rate": 0.01053907863158355, + "loss": 3.0342, + "mean_token_accuracy": 0.3989701271057129, + "num_tokens": 5017949515.0, + "step": 9816 + }, + { + "epoch": 2.6546782044348296, + "grad_norm": 3.03125, + "learning_rate": 0.010537504218240297, + "loss": 2.9887, + "mean_token_accuracy": 0.40913671255111694, + "num_tokens": 5018415366.0, + "step": 9817 + }, + { + "epoch": 2.6549486208761492, + "grad_norm": 2.828125, + "learning_rate": 0.01053592981908778, + "loss": 3.0106, + "mean_token_accuracy": 0.41328758001327515, + "num_tokens": 5018939605.0, + "step": 9818 + }, + { + "epoch": 2.655219037317469, + "grad_norm": 2.828125, + "learning_rate": 0.010534355434174296, + "loss": 2.9712, + "mean_token_accuracy": 0.39841514825820923, + "num_tokens": 5019463849.0, + "step": 9819 + }, + { + "epoch": 2.6554894537587885, + "grad_norm": 2.234375, + "learning_rate": 0.010532781063548158, + "loss": 3.0016, + "mean_token_accuracy": 0.4274744391441345, + "num_tokens": 5019987978.0, + "step": 9820 + }, + { + "epoch": 2.655759870200108, + "grad_norm": 2.59375, + "learning_rate": 0.010531206707257672, + "loss": 2.8542, + "mean_token_accuracy": 0.40333133935928345, + "num_tokens": 5020512082.0, + "step": 9821 + }, + { + "epoch": 2.656030286641428, + "grad_norm": 2.4375, + "learning_rate": 0.010529632365351146, + "loss": 2.89, + "mean_token_accuracy": 0.432037353515625, + "num_tokens": 5020988691.0, + "step": 9822 + }, + { + "epoch": 2.6563007030827475, + "grad_norm": 3.015625, + "learning_rate": 0.010528058037876875, + "loss": 3.1435, + "mean_token_accuracy": 0.4117489755153656, + "num_tokens": 5021512979.0, + "step": 9823 + }, + { + "epoch": 2.656571119524067, + "grad_norm": 3.046875, + "learning_rate": 0.010526483724883177, + "loss": 3.0933, + "mean_token_accuracy": 0.37365254759788513, + "num_tokens": 5022037140.0, + "step": 9824 + }, + { + "epoch": 2.6568415359653867, + "grad_norm": 2.859375, + "learning_rate": 0.01052490942641835, + "loss": 3.0377, + "mean_token_accuracy": 0.4148898720741272, + "num_tokens": 5022529202.0, + "step": 9825 + }, + { + "epoch": 2.6571119524067064, + "grad_norm": 3.15625, + "learning_rate": 0.010523335142530697, + "loss": 3.2622, + "mean_token_accuracy": 0.40828603506088257, + "num_tokens": 5023053471.0, + "step": 9826 + }, + { + "epoch": 2.657382368848026, + "grad_norm": 3.0, + "learning_rate": 0.010521760873268525, + "loss": 3.1635, + "mean_token_accuracy": 0.39139220118522644, + "num_tokens": 5023567270.0, + "step": 9827 + }, + { + "epoch": 2.6576527852893457, + "grad_norm": 2.8125, + "learning_rate": 0.010520186618680136, + "loss": 2.9741, + "mean_token_accuracy": 0.4168652296066284, + "num_tokens": 5024091473.0, + "step": 9828 + }, + { + "epoch": 2.6579232017306653, + "grad_norm": 3.0625, + "learning_rate": 0.010518612378813833, + "loss": 3.0172, + "mean_token_accuracy": 0.3974551558494568, + "num_tokens": 5024615735.0, + "step": 9829 + }, + { + "epoch": 2.658193618171985, + "grad_norm": 2.828125, + "learning_rate": 0.010517038153717918, + "loss": 2.9358, + "mean_token_accuracy": 0.4238365888595581, + "num_tokens": 5025139988.0, + "step": 9830 + }, + { + "epoch": 2.6584640346133046, + "grad_norm": 4.75, + "learning_rate": 0.010515463943440691, + "loss": 10.3831, + "mean_token_accuracy": 6.641678919550031e-05, + "num_tokens": 5025664261.0, + "step": 9831 + }, + { + "epoch": 2.6587344510546242, + "grad_norm": 10.375, + "learning_rate": 0.010513889748030454, + "loss": 3.4087, + "mean_token_accuracy": 0.37527573108673096, + "num_tokens": 5026188427.0, + "step": 9832 + }, + { + "epoch": 2.659004867495944, + "grad_norm": 2.546875, + "learning_rate": 0.010512315567535511, + "loss": 3.0616, + "mean_token_accuracy": 0.4089392125606537, + "num_tokens": 5026674923.0, + "step": 9833 + }, + { + "epoch": 2.6592752839372635, + "grad_norm": 2.015625, + "learning_rate": 0.010510741402004163, + "loss": 3.1332, + "mean_token_accuracy": 0.4040186405181885, + "num_tokens": 5027199128.0, + "step": 9834 + }, + { + "epoch": 2.659545700378583, + "grad_norm": 2.796875, + "learning_rate": 0.010509167251484701, + "loss": 3.0208, + "mean_token_accuracy": 0.38729119300842285, + "num_tokens": 5027670013.0, + "step": 9835 + }, + { + "epoch": 2.6598161168199024, + "grad_norm": 2.75, + "learning_rate": 0.010507593116025438, + "loss": 3.174, + "mean_token_accuracy": 0.403150737285614, + "num_tokens": 5028194286.0, + "step": 9836 + }, + { + "epoch": 2.6600865332612225, + "grad_norm": 2.984375, + "learning_rate": 0.010506018995674661, + "loss": 2.9182, + "mean_token_accuracy": 0.3976197838783264, + "num_tokens": 5028718551.0, + "step": 9837 + }, + { + "epoch": 2.6603569497025417, + "grad_norm": 3.0, + "learning_rate": 0.010504444890480677, + "loss": 3.0103, + "mean_token_accuracy": 0.39407771825790405, + "num_tokens": 5029242651.0, + "step": 9838 + }, + { + "epoch": 2.6606273661438617, + "grad_norm": 2.921875, + "learning_rate": 0.010502870800491781, + "loss": 2.9357, + "mean_token_accuracy": 0.4170280396938324, + "num_tokens": 5029766932.0, + "step": 9839 + }, + { + "epoch": 2.660897782585181, + "grad_norm": 3.265625, + "learning_rate": 0.010501296725756271, + "loss": 3.2227, + "mean_token_accuracy": 0.39257490634918213, + "num_tokens": 5030231011.0, + "step": 9840 + }, + { + "epoch": 2.661168199026501, + "grad_norm": 3.71875, + "learning_rate": 0.010499722666322445, + "loss": 3.3451, + "mean_token_accuracy": 0.39093828201293945, + "num_tokens": 5030755296.0, + "step": 9841 + }, + { + "epoch": 2.6614386154678202, + "grad_norm": 2.9375, + "learning_rate": 0.010498148622238596, + "loss": 2.7967, + "mean_token_accuracy": 0.3870609700679779, + "num_tokens": 5031219225.0, + "step": 9842 + }, + { + "epoch": 2.6617090319091403, + "grad_norm": 3.109375, + "learning_rate": 0.010496574593553023, + "loss": 2.8978, + "mean_token_accuracy": 0.42819300293922424, + "num_tokens": 5031743496.0, + "step": 9843 + }, + { + "epoch": 2.6619794483504595, + "grad_norm": 2.71875, + "learning_rate": 0.010495000580314025, + "loss": 3.2328, + "mean_token_accuracy": 0.400790810585022, + "num_tokens": 5032219519.0, + "step": 9844 + }, + { + "epoch": 2.6622498647917796, + "grad_norm": 3.0, + "learning_rate": 0.010493426582569894, + "loss": 3.0255, + "mean_token_accuracy": 0.4232521951198578, + "num_tokens": 5032743707.0, + "step": 9845 + }, + { + "epoch": 2.662520281233099, + "grad_norm": 2.890625, + "learning_rate": 0.010491852600368922, + "loss": 3.2504, + "mean_token_accuracy": 0.3972751498222351, + "num_tokens": 5033267986.0, + "step": 9846 + }, + { + "epoch": 2.6627906976744184, + "grad_norm": 3.390625, + "learning_rate": 0.010490278633759407, + "loss": 2.9549, + "mean_token_accuracy": 0.41425785422325134, + "num_tokens": 5033792145.0, + "step": 9847 + }, + { + "epoch": 2.663061114115738, + "grad_norm": 2.140625, + "learning_rate": 0.010488704682789647, + "loss": 2.9349, + "mean_token_accuracy": 0.42573851346969604, + "num_tokens": 5034278724.0, + "step": 9848 + }, + { + "epoch": 2.6633315305570577, + "grad_norm": 3.171875, + "learning_rate": 0.010487130747507924, + "loss": 2.9889, + "mean_token_accuracy": 0.4055195152759552, + "num_tokens": 5034802950.0, + "step": 9849 + }, + { + "epoch": 2.6636019469983774, + "grad_norm": 2.375, + "learning_rate": 0.010485556827962544, + "loss": 3.0388, + "mean_token_accuracy": 0.42026418447494507, + "num_tokens": 5035327080.0, + "step": 9850 + }, + { + "epoch": 2.663872363439697, + "grad_norm": 204.0, + "learning_rate": 0.010483982924201794, + "loss": 16.1311, + "mean_token_accuracy": 0.0003890415246132761, + "num_tokens": 5035851345.0, + "step": 9851 + }, + { + "epoch": 2.6641427798810167, + "grad_norm": 6.84375, + "learning_rate": 0.01048240903627396, + "loss": 3.2746, + "mean_token_accuracy": 0.38033461570739746, + "num_tokens": 5036375488.0, + "step": 9852 + }, + { + "epoch": 2.6644131963223363, + "grad_norm": 2.5625, + "learning_rate": 0.010480835164227343, + "loss": 3.2625, + "mean_token_accuracy": 0.38502952456474304, + "num_tokens": 5036898558.0, + "step": 9853 + }, + { + "epoch": 2.664683612763656, + "grad_norm": 2.5, + "learning_rate": 0.010479261308110228, + "loss": 2.9362, + "mean_token_accuracy": 0.4000590443611145, + "num_tokens": 5037422796.0, + "step": 9854 + }, + { + "epoch": 2.6649540292049756, + "grad_norm": 2.640625, + "learning_rate": 0.010477687467970905, + "loss": 3.0635, + "mean_token_accuracy": 0.405232310295105, + "num_tokens": 5037896053.0, + "step": 9855 + }, + { + "epoch": 2.6652244456462952, + "grad_norm": 3.3125, + "learning_rate": 0.010476113643857672, + "loss": 3.1649, + "mean_token_accuracy": 0.39535337686538696, + "num_tokens": 5038375797.0, + "step": 9856 + }, + { + "epoch": 2.665494862087615, + "grad_norm": 3.65625, + "learning_rate": 0.01047453983581881, + "loss": 3.096, + "mean_token_accuracy": 0.40242549777030945, + "num_tokens": 5038899990.0, + "step": 9857 + }, + { + "epoch": 2.6657652785289345, + "grad_norm": 3.0625, + "learning_rate": 0.01047296604390261, + "loss": 3.034, + "mean_token_accuracy": 0.40480712056159973, + "num_tokens": 5039424197.0, + "step": 9858 + }, + { + "epoch": 2.666035694970254, + "grad_norm": 3.171875, + "learning_rate": 0.010471392268157366, + "loss": 3.1049, + "mean_token_accuracy": 0.39699405431747437, + "num_tokens": 5039948425.0, + "step": 9859 + }, + { + "epoch": 2.666306111411574, + "grad_norm": 3.0625, + "learning_rate": 0.010469818508631356, + "loss": 3.0987, + "mean_token_accuracy": 0.4226469099521637, + "num_tokens": 5040419234.0, + "step": 9860 + }, + { + "epoch": 2.6665765278528935, + "grad_norm": 3.015625, + "learning_rate": 0.010468244765372876, + "loss": 3.0335, + "mean_token_accuracy": 0.3995642066001892, + "num_tokens": 5040943422.0, + "step": 9861 + }, + { + "epoch": 2.666846944294213, + "grad_norm": 2.671875, + "learning_rate": 0.01046667103843021, + "loss": 3.0642, + "mean_token_accuracy": 0.418597936630249, + "num_tokens": 5041467586.0, + "step": 9862 + }, + { + "epoch": 2.6671173607355327, + "grad_norm": 2.703125, + "learning_rate": 0.010465097327851645, + "loss": 3.0893, + "mean_token_accuracy": 0.40008270740509033, + "num_tokens": 5041981428.0, + "step": 9863 + }, + { + "epoch": 2.6673877771768524, + "grad_norm": 2.6875, + "learning_rate": 0.01046352363368547, + "loss": 2.874, + "mean_token_accuracy": 0.4047503173351288, + "num_tokens": 5042505578.0, + "step": 9864 + }, + { + "epoch": 2.667658193618172, + "grad_norm": 2.34375, + "learning_rate": 0.010461949955979965, + "loss": 3.082, + "mean_token_accuracy": 0.41467195749282837, + "num_tokens": 5043029820.0, + "step": 9865 + }, + { + "epoch": 2.6679286100594917, + "grad_norm": 3.125, + "learning_rate": 0.010460376294783419, + "loss": 3.0843, + "mean_token_accuracy": 0.3755512833595276, + "num_tokens": 5043554055.0, + "step": 9866 + }, + { + "epoch": 2.6681990265008113, + "grad_norm": 2.5625, + "learning_rate": 0.010458802650144115, + "loss": 3.0763, + "mean_token_accuracy": 0.39298710227012634, + "num_tokens": 5044078283.0, + "step": 9867 + }, + { + "epoch": 2.668469442942131, + "grad_norm": 3.640625, + "learning_rate": 0.010457229022110335, + "loss": 3.1385, + "mean_token_accuracy": 0.42488840222358704, + "num_tokens": 5044602506.0, + "step": 9868 + }, + { + "epoch": 2.6687398593834506, + "grad_norm": 2.8125, + "learning_rate": 0.010455655410730362, + "loss": 2.8091, + "mean_token_accuracy": 0.40874183177948, + "num_tokens": 5045126480.0, + "step": 9869 + }, + { + "epoch": 2.6690102758247702, + "grad_norm": 2.734375, + "learning_rate": 0.01045408181605249, + "loss": 3.0906, + "mean_token_accuracy": 0.39250487089157104, + "num_tokens": 5045650700.0, + "step": 9870 + }, + { + "epoch": 2.66928069226609, + "grad_norm": 5.8125, + "learning_rate": 0.01045250823812499, + "loss": 10.4757, + "mean_token_accuracy": 0.0026281471364200115, + "num_tokens": 5046169335.0, + "step": 9871 + }, + { + "epoch": 2.6695511087074095, + "grad_norm": 7.09375, + "learning_rate": 0.010450934676996145, + "loss": 3.3658, + "mean_token_accuracy": 0.36176496744155884, + "num_tokens": 5046690314.0, + "step": 9872 + }, + { + "epoch": 2.669821525148729, + "grad_norm": 2.296875, + "learning_rate": 0.010449361132714241, + "loss": 3.1709, + "mean_token_accuracy": 0.39023977518081665, + "num_tokens": 5047214496.0, + "step": 9873 + }, + { + "epoch": 2.670091941590049, + "grad_norm": 3.140625, + "learning_rate": 0.010447787605327557, + "loss": 3.0505, + "mean_token_accuracy": 0.41779929399490356, + "num_tokens": 5047698041.0, + "step": 9874 + }, + { + "epoch": 2.6703623580313685, + "grad_norm": 3.3125, + "learning_rate": 0.010446214094884372, + "loss": 2.9923, + "mean_token_accuracy": 0.4085858464241028, + "num_tokens": 5048154055.0, + "step": 9875 + }, + { + "epoch": 2.670632774472688, + "grad_norm": 2.859375, + "learning_rate": 0.010444640601432966, + "loss": 3.2634, + "mean_token_accuracy": 0.38815101981163025, + "num_tokens": 5048678244.0, + "step": 9876 + }, + { + "epoch": 2.6709031909140073, + "grad_norm": 3.046875, + "learning_rate": 0.010443067125021624, + "loss": 3.1717, + "mean_token_accuracy": 0.38929253816604614, + "num_tokens": 5049202425.0, + "step": 9877 + }, + { + "epoch": 2.6711736073553274, + "grad_norm": 3.5625, + "learning_rate": 0.010441493665698614, + "loss": 3.1557, + "mean_token_accuracy": 0.3950144052505493, + "num_tokens": 5049703017.0, + "step": 9878 + }, + { + "epoch": 2.6714440237966466, + "grad_norm": 3.21875, + "learning_rate": 0.010439920223512226, + "loss": 3.2719, + "mean_token_accuracy": 0.39871495962142944, + "num_tokens": 5050227162.0, + "step": 9879 + }, + { + "epoch": 2.6717144402379667, + "grad_norm": 4.0625, + "learning_rate": 0.010438346798510734, + "loss": 3.1236, + "mean_token_accuracy": 0.39855167269706726, + "num_tokens": 5050751266.0, + "step": 9880 + }, + { + "epoch": 2.671984856679286, + "grad_norm": 2.8125, + "learning_rate": 0.010436773390742407, + "loss": 3.1774, + "mean_token_accuracy": 0.4045095145702362, + "num_tokens": 5051275543.0, + "step": 9881 + }, + { + "epoch": 2.672255273120606, + "grad_norm": 2.765625, + "learning_rate": 0.010435200000255534, + "loss": 2.9901, + "mean_token_accuracy": 0.4053056240081787, + "num_tokens": 5051799650.0, + "step": 9882 + }, + { + "epoch": 2.672525689561925, + "grad_norm": 2.296875, + "learning_rate": 0.010433626627098383, + "loss": 2.8861, + "mean_token_accuracy": 0.43369948863983154, + "num_tokens": 5052323918.0, + "step": 9883 + }, + { + "epoch": 2.6727961060032452, + "grad_norm": 3.171875, + "learning_rate": 0.010432053271319235, + "loss": 2.9841, + "mean_token_accuracy": 0.4203351140022278, + "num_tokens": 5052847934.0, + "step": 9884 + }, + { + "epoch": 2.6730665224445644, + "grad_norm": 2.84375, + "learning_rate": 0.010430479932966363, + "loss": 2.9713, + "mean_token_accuracy": 0.4109429121017456, + "num_tokens": 5053371895.0, + "step": 9885 + }, + { + "epoch": 2.6733369388858845, + "grad_norm": 2.78125, + "learning_rate": 0.010428906612088038, + "loss": 2.9526, + "mean_token_accuracy": 0.4203169345855713, + "num_tokens": 5053896170.0, + "step": 9886 + }, + { + "epoch": 2.6736073553272037, + "grad_norm": 3.3125, + "learning_rate": 0.010427333308732543, + "loss": 3.2926, + "mean_token_accuracy": 0.37802451848983765, + "num_tokens": 5054420443.0, + "step": 9887 + }, + { + "epoch": 2.6738777717685234, + "grad_norm": 2.8125, + "learning_rate": 0.010425760022948143, + "loss": 2.9467, + "mean_token_accuracy": 0.42694562673568726, + "num_tokens": 5054944654.0, + "step": 9888 + }, + { + "epoch": 2.674148188209843, + "grad_norm": 2.921875, + "learning_rate": 0.010424186754783113, + "loss": 2.8481, + "mean_token_accuracy": 0.4274289608001709, + "num_tokens": 5055468800.0, + "step": 9889 + }, + { + "epoch": 2.6744186046511627, + "grad_norm": 2.765625, + "learning_rate": 0.01042261350428573, + "loss": 3.2165, + "mean_token_accuracy": 0.39897042512893677, + "num_tokens": 5055993073.0, + "step": 9890 + }, + { + "epoch": 2.6746890210924823, + "grad_norm": 83.0, + "learning_rate": 0.010421040271504261, + "loss": 32.4066, + "mean_token_accuracy": 0.0, + "num_tokens": 5056517178.0, + "step": 9891 + }, + { + "epoch": 2.674959437533802, + "grad_norm": 7.0625, + "learning_rate": 0.010419467056486978, + "loss": 3.4648, + "mean_token_accuracy": 0.3698914349079132, + "num_tokens": 5057041401.0, + "step": 9892 + }, + { + "epoch": 2.6752298539751216, + "grad_norm": 2.453125, + "learning_rate": 0.010417893859282155, + "loss": 3.0951, + "mean_token_accuracy": 0.39122992753982544, + "num_tokens": 5057565645.0, + "step": 9893 + }, + { + "epoch": 2.6755002704164412, + "grad_norm": 3.484375, + "learning_rate": 0.01041632067993806, + "loss": 3.1378, + "mean_token_accuracy": 0.4008176922798157, + "num_tokens": 5058078566.0, + "step": 9894 + }, + { + "epoch": 2.675770686857761, + "grad_norm": 3.625, + "learning_rate": 0.010414747518502961, + "loss": 3.3509, + "mean_token_accuracy": 0.3993089199066162, + "num_tokens": 5058602728.0, + "step": 9895 + }, + { + "epoch": 2.6760411032990805, + "grad_norm": 2.875, + "learning_rate": 0.010413174375025131, + "loss": 3.0456, + "mean_token_accuracy": 0.4047752022743225, + "num_tokens": 5059126996.0, + "step": 9896 + }, + { + "epoch": 2.6763115197404, + "grad_norm": 3.203125, + "learning_rate": 0.010411601249552838, + "loss": 3.0017, + "mean_token_accuracy": 0.4027585983276367, + "num_tokens": 5059651145.0, + "step": 9897 + }, + { + "epoch": 2.67658193618172, + "grad_norm": 2.609375, + "learning_rate": 0.010410028142134344, + "loss": 3.2233, + "mean_token_accuracy": 0.38648027181625366, + "num_tokens": 5060175377.0, + "step": 9898 + }, + { + "epoch": 2.6768523526230394, + "grad_norm": 2.828125, + "learning_rate": 0.010408455052817926, + "loss": 3.1395, + "mean_token_accuracy": 0.40845608711242676, + "num_tokens": 5060699507.0, + "step": 9899 + }, + { + "epoch": 2.677122769064359, + "grad_norm": 3.21875, + "learning_rate": 0.010406881981651848, + "loss": 3.0444, + "mean_token_accuracy": 0.3836754262447357, + "num_tokens": 5061223782.0, + "step": 9900 + }, + { + "epoch": 2.6773931855056787, + "grad_norm": 3.078125, + "learning_rate": 0.010405308928684371, + "loss": 3.3864, + "mean_token_accuracy": 0.3698113262653351, + "num_tokens": 5061747987.0, + "step": 9901 + }, + { + "epoch": 2.6776636019469984, + "grad_norm": 3.453125, + "learning_rate": 0.010403735893963768, + "loss": 3.3163, + "mean_token_accuracy": 0.35347914695739746, + "num_tokens": 5062272264.0, + "step": 9902 + }, + { + "epoch": 2.677934018388318, + "grad_norm": 2.765625, + "learning_rate": 0.010402162877538302, + "loss": 3.2164, + "mean_token_accuracy": 0.40502113103866577, + "num_tokens": 5062742103.0, + "step": 9903 + }, + { + "epoch": 2.6782044348296377, + "grad_norm": 3.34375, + "learning_rate": 0.01040058987945623, + "loss": 3.2584, + "mean_token_accuracy": 0.3877113461494446, + "num_tokens": 5063266382.0, + "step": 9904 + }, + { + "epoch": 2.6784748512709573, + "grad_norm": 2.234375, + "learning_rate": 0.010399016899765832, + "loss": 2.8177, + "mean_token_accuracy": 0.41247260570526123, + "num_tokens": 5063790588.0, + "step": 9905 + }, + { + "epoch": 2.678745267712277, + "grad_norm": 3.359375, + "learning_rate": 0.010397443938515357, + "loss": 3.1576, + "mean_token_accuracy": 0.37263864278793335, + "num_tokens": 5064314861.0, + "step": 9906 + }, + { + "epoch": 2.6790156841535966, + "grad_norm": 2.875, + "learning_rate": 0.010395870995753077, + "loss": 2.9523, + "mean_token_accuracy": 0.3903461992740631, + "num_tokens": 5064839138.0, + "step": 9907 + }, + { + "epoch": 2.6792861005949162, + "grad_norm": 2.71875, + "learning_rate": 0.010394298071527253, + "loss": 3.0769, + "mean_token_accuracy": 0.40527620911598206, + "num_tokens": 5065342806.0, + "step": 9908 + }, + { + "epoch": 2.679556517036236, + "grad_norm": 3.359375, + "learning_rate": 0.010392725165886138, + "loss": 3.1831, + "mean_token_accuracy": 0.39415496587753296, + "num_tokens": 5065866985.0, + "step": 9909 + }, + { + "epoch": 2.6798269334775555, + "grad_norm": 3.5, + "learning_rate": 0.010391152278878008, + "loss": 3.111, + "mean_token_accuracy": 0.3931278586387634, + "num_tokens": 5066352638.0, + "step": 9910 + }, + { + "epoch": 2.680097349918875, + "grad_norm": 44.0, + "learning_rate": 0.010389579410551115, + "loss": 20.7597, + "mean_token_accuracy": 0.042236652225255966, + "num_tokens": 5066876794.0, + "step": 9911 + }, + { + "epoch": 2.680367766360195, + "grad_norm": 6.09375, + "learning_rate": 0.010388006560953717, + "loss": 3.4636, + "mean_token_accuracy": 0.40316450595855713, + "num_tokens": 5067334965.0, + "step": 9912 + }, + { + "epoch": 2.6806381828015144, + "grad_norm": 1.953125, + "learning_rate": 0.010386433730134084, + "loss": 3.0078, + "mean_token_accuracy": 0.41075843572616577, + "num_tokens": 5067859104.0, + "step": 9913 + }, + { + "epoch": 2.680908599242834, + "grad_norm": 2.4375, + "learning_rate": 0.010384860918140466, + "loss": 3.068, + "mean_token_accuracy": 0.3886778652667999, + "num_tokens": 5068383268.0, + "step": 9914 + }, + { + "epoch": 2.6811790156841537, + "grad_norm": 2.78125, + "learning_rate": 0.010383288125021121, + "loss": 3.0685, + "mean_token_accuracy": 0.43696218729019165, + "num_tokens": 5068875994.0, + "step": 9915 + }, + { + "epoch": 2.6814494321254734, + "grad_norm": 2.40625, + "learning_rate": 0.010381715350824314, + "loss": 3.0247, + "mean_token_accuracy": 0.42277494072914124, + "num_tokens": 5069336360.0, + "step": 9916 + }, + { + "epoch": 2.681719848566793, + "grad_norm": 3.296875, + "learning_rate": 0.010380142595598297, + "loss": 3.0003, + "mean_token_accuracy": 0.40556755661964417, + "num_tokens": 5069860477.0, + "step": 9917 + }, + { + "epoch": 2.681990265008112, + "grad_norm": 2.640625, + "learning_rate": 0.010378569859391326, + "loss": 2.9328, + "mean_token_accuracy": 0.4332163333892822, + "num_tokens": 5070319497.0, + "step": 9918 + }, + { + "epoch": 2.6822606814494323, + "grad_norm": 3.40625, + "learning_rate": 0.01037699714225166, + "loss": 3.0244, + "mean_token_accuracy": 0.41208600997924805, + "num_tokens": 5070843547.0, + "step": 9919 + }, + { + "epoch": 2.6825310978907515, + "grad_norm": 2.578125, + "learning_rate": 0.010375424444227557, + "loss": 3.1427, + "mean_token_accuracy": 0.39114904403686523, + "num_tokens": 5071367744.0, + "step": 9920 + }, + { + "epoch": 2.6828015143320716, + "grad_norm": 2.8125, + "learning_rate": 0.010373851765367264, + "loss": 3.1049, + "mean_token_accuracy": 0.3934165835380554, + "num_tokens": 5071891988.0, + "step": 9921 + }, + { + "epoch": 2.683071930773391, + "grad_norm": 2.25, + "learning_rate": 0.010372279105719045, + "loss": 2.7349, + "mean_token_accuracy": 0.4313080906867981, + "num_tokens": 5072369681.0, + "step": 9922 + }, + { + "epoch": 2.683342347214711, + "grad_norm": 2.265625, + "learning_rate": 0.010370706465331147, + "loss": 3.1054, + "mean_token_accuracy": 0.40287908911705017, + "num_tokens": 5072893812.0, + "step": 9923 + }, + { + "epoch": 2.68361276365603, + "grad_norm": 2.484375, + "learning_rate": 0.010369133844251824, + "loss": 2.9715, + "mean_token_accuracy": 0.4129447042942047, + "num_tokens": 5073418018.0, + "step": 9924 + }, + { + "epoch": 2.68388318009735, + "grad_norm": 2.8125, + "learning_rate": 0.010367561242529329, + "loss": 2.9855, + "mean_token_accuracy": 0.4198983311653137, + "num_tokens": 5073942236.0, + "step": 9925 + }, + { + "epoch": 2.6841535965386694, + "grad_norm": 3.125, + "learning_rate": 0.01036598866021192, + "loss": 2.8011, + "mean_token_accuracy": 0.43588995933532715, + "num_tokens": 5074466506.0, + "step": 9926 + }, + { + "epoch": 2.6844240129799894, + "grad_norm": 2.484375, + "learning_rate": 0.010364416097347838, + "loss": 3.2055, + "mean_token_accuracy": 0.4046967625617981, + "num_tokens": 5074990744.0, + "step": 9927 + }, + { + "epoch": 2.6846944294213086, + "grad_norm": 3.203125, + "learning_rate": 0.010362843553985342, + "loss": 3.059, + "mean_token_accuracy": 0.4058248996734619, + "num_tokens": 5075514769.0, + "step": 9928 + }, + { + "epoch": 2.6849648458626283, + "grad_norm": 2.875, + "learning_rate": 0.010361271030172677, + "loss": 2.9097, + "mean_token_accuracy": 0.43252113461494446, + "num_tokens": 5075978693.0, + "step": 9929 + }, + { + "epoch": 2.685235262303948, + "grad_norm": 3.359375, + "learning_rate": 0.010359698525958099, + "loss": 3.0327, + "mean_token_accuracy": 0.4172070026397705, + "num_tokens": 5076502954.0, + "step": 9930 + }, + { + "epoch": 2.6855056787452676, + "grad_norm": 31.75, + "learning_rate": 0.010358126041389852, + "loss": 13.107, + "mean_token_accuracy": 0.0021928574424237013, + "num_tokens": 5077027155.0, + "step": 9931 + }, + { + "epoch": 2.685776095186587, + "grad_norm": 7.0, + "learning_rate": 0.010356553576516183, + "loss": 3.5252, + "mean_token_accuracy": 0.37748873233795166, + "num_tokens": 5077551330.0, + "step": 9932 + }, + { + "epoch": 2.686046511627907, + "grad_norm": 2.21875, + "learning_rate": 0.010354981131385346, + "loss": 2.9838, + "mean_token_accuracy": 0.4063172936439514, + "num_tokens": 5078075508.0, + "step": 9933 + }, + { + "epoch": 2.6863169280692265, + "grad_norm": 2.25, + "learning_rate": 0.010353408706045585, + "loss": 3.0638, + "mean_token_accuracy": 0.41196680068969727, + "num_tokens": 5078599740.0, + "step": 9934 + }, + { + "epoch": 2.686587344510546, + "grad_norm": 3.171875, + "learning_rate": 0.01035183630054514, + "loss": 3.0775, + "mean_token_accuracy": 0.40649551153182983, + "num_tokens": 5079082099.0, + "step": 9935 + }, + { + "epoch": 2.686857760951866, + "grad_norm": 2.390625, + "learning_rate": 0.010350263914932272, + "loss": 3.1181, + "mean_token_accuracy": 0.39699479937553406, + "num_tokens": 5079606275.0, + "step": 9936 + }, + { + "epoch": 2.6871281773931854, + "grad_norm": 3.171875, + "learning_rate": 0.010348691549255216, + "loss": 2.8501, + "mean_token_accuracy": 0.4419791102409363, + "num_tokens": 5080130367.0, + "step": 9937 + }, + { + "epoch": 2.687398593834505, + "grad_norm": 64.5, + "learning_rate": 0.010347119203562214, + "loss": 4.1207, + "mean_token_accuracy": 0.3638099730014801, + "num_tokens": 5080599544.0, + "step": 9938 + }, + { + "epoch": 2.6876690102758247, + "grad_norm": 5.0625, + "learning_rate": 0.010345546877901518, + "loss": 3.3467, + "mean_token_accuracy": 0.3814544081687927, + "num_tokens": 5081121358.0, + "step": 9939 + }, + { + "epoch": 2.6879394267171444, + "grad_norm": 2.59375, + "learning_rate": 0.010343974572321368, + "loss": 2.9472, + "mean_token_accuracy": 0.41818347573280334, + "num_tokens": 5081645565.0, + "step": 9940 + }, + { + "epoch": 2.688209843158464, + "grad_norm": 3.671875, + "learning_rate": 0.010342402286870006, + "loss": 3.2246, + "mean_token_accuracy": 0.38739991188049316, + "num_tokens": 5082169844.0, + "step": 9941 + }, + { + "epoch": 2.6884802595997837, + "grad_norm": 3.484375, + "learning_rate": 0.010340830021595677, + "loss": 3.259, + "mean_token_accuracy": 0.39079731702804565, + "num_tokens": 5082694119.0, + "step": 9942 + }, + { + "epoch": 2.6887506760411033, + "grad_norm": 2.375, + "learning_rate": 0.010339257776546625, + "loss": 3.021, + "mean_token_accuracy": 0.4065049886703491, + "num_tokens": 5083218252.0, + "step": 9943 + }, + { + "epoch": 2.689021092482423, + "grad_norm": 2.84375, + "learning_rate": 0.010337685551771081, + "loss": 3.226, + "mean_token_accuracy": 0.3895059823989868, + "num_tokens": 5083742403.0, + "step": 9944 + }, + { + "epoch": 2.6892915089237426, + "grad_norm": 3.859375, + "learning_rate": 0.010336113347317296, + "loss": 2.9007, + "mean_token_accuracy": 0.44849690794944763, + "num_tokens": 5084266686.0, + "step": 9945 + }, + { + "epoch": 2.6895619253650622, + "grad_norm": 2.5, + "learning_rate": 0.010334541163233504, + "loss": 2.8565, + "mean_token_accuracy": 0.43954724073410034, + "num_tokens": 5084728414.0, + "step": 9946 + }, + { + "epoch": 2.689832341806382, + "grad_norm": 3.0625, + "learning_rate": 0.010332968999567947, + "loss": 3.0381, + "mean_token_accuracy": 0.41642725467681885, + "num_tokens": 5085246776.0, + "step": 9947 + }, + { + "epoch": 2.6901027582477015, + "grad_norm": 2.71875, + "learning_rate": 0.010331396856368863, + "loss": 2.9622, + "mean_token_accuracy": 0.4054752588272095, + "num_tokens": 5085770961.0, + "step": 9948 + }, + { + "epoch": 2.690373174689021, + "grad_norm": 2.5625, + "learning_rate": 0.010329824733684491, + "loss": 3.0283, + "mean_token_accuracy": 0.41284608840942383, + "num_tokens": 5086295205.0, + "step": 9949 + }, + { + "epoch": 2.690643591130341, + "grad_norm": 2.71875, + "learning_rate": 0.010328252631563062, + "loss": 2.6042, + "mean_token_accuracy": 0.43334609270095825, + "num_tokens": 5086819252.0, + "step": 9950 + }, + { + "epoch": 2.6909140075716604, + "grad_norm": 15.1875, + "learning_rate": 0.010326680550052822, + "loss": 10.0004, + "mean_token_accuracy": 0.02735765650868416, + "num_tokens": 5087281081.0, + "step": 9951 + }, + { + "epoch": 2.69118442401298, + "grad_norm": 5.46875, + "learning_rate": 0.010325108489202001, + "loss": 3.5291, + "mean_token_accuracy": 0.38724005222320557, + "num_tokens": 5087743436.0, + "step": 9952 + }, + { + "epoch": 2.6914548404542997, + "grad_norm": 12.3125, + "learning_rate": 0.010323536449058838, + "loss": 3.3889, + "mean_token_accuracy": 0.39565056562423706, + "num_tokens": 5088252604.0, + "step": 9953 + }, + { + "epoch": 2.6917252568956194, + "grad_norm": 2.796875, + "learning_rate": 0.010321964429671567, + "loss": 3.3265, + "mean_token_accuracy": 0.3858502209186554, + "num_tokens": 5088734518.0, + "step": 9954 + }, + { + "epoch": 2.691995673336939, + "grad_norm": 2.625, + "learning_rate": 0.010320392431088416, + "loss": 3.3257, + "mean_token_accuracy": 0.3902187943458557, + "num_tokens": 5089258778.0, + "step": 9955 + }, + { + "epoch": 2.6922660897782587, + "grad_norm": 3.328125, + "learning_rate": 0.010318820453357629, + "loss": 2.9025, + "mean_token_accuracy": 0.40819406509399414, + "num_tokens": 5089782870.0, + "step": 9956 + }, + { + "epoch": 2.6925365062195783, + "grad_norm": 2.53125, + "learning_rate": 0.010317248496527433, + "loss": 2.993, + "mean_token_accuracy": 0.41812455654144287, + "num_tokens": 5090219628.0, + "step": 9957 + }, + { + "epoch": 2.692806922660898, + "grad_norm": 3.34375, + "learning_rate": 0.010315676560646058, + "loss": 3.1536, + "mean_token_accuracy": 0.4000929892063141, + "num_tokens": 5090743768.0, + "step": 9958 + }, + { + "epoch": 2.693077339102217, + "grad_norm": 2.515625, + "learning_rate": 0.010314104645761744, + "loss": 3.2352, + "mean_token_accuracy": 0.4074597954750061, + "num_tokens": 5091267830.0, + "step": 9959 + }, + { + "epoch": 2.6933477555435372, + "grad_norm": 2.96875, + "learning_rate": 0.010312532751922713, + "loss": 3.1722, + "mean_token_accuracy": 0.3922491669654846, + "num_tokens": 5091792070.0, + "step": 9960 + }, + { + "epoch": 2.6936181719848564, + "grad_norm": 3.296875, + "learning_rate": 0.010310960879177197, + "loss": 3.2939, + "mean_token_accuracy": 0.4012190103530884, + "num_tokens": 5092311645.0, + "step": 9961 + }, + { + "epoch": 2.6938885884261765, + "grad_norm": 4.21875, + "learning_rate": 0.01030938902757343, + "loss": 3.3612, + "mean_token_accuracy": 0.3902028203010559, + "num_tokens": 5092835865.0, + "step": 9962 + }, + { + "epoch": 2.6941590048674957, + "grad_norm": 2.40625, + "learning_rate": 0.010307817197159641, + "loss": 3.1725, + "mean_token_accuracy": 0.400450199842453, + "num_tokens": 5093323553.0, + "step": 9963 + }, + { + "epoch": 2.694429421308816, + "grad_norm": 3.71875, + "learning_rate": 0.01030624538798405, + "loss": 3.091, + "mean_token_accuracy": 0.3988126218318939, + "num_tokens": 5093847720.0, + "step": 9964 + }, + { + "epoch": 2.694699837750135, + "grad_norm": 3.609375, + "learning_rate": 0.010304673600094896, + "loss": 3.103, + "mean_token_accuracy": 0.42492184042930603, + "num_tokens": 5094294972.0, + "step": 9965 + }, + { + "epoch": 2.694970254191455, + "grad_norm": 2.765625, + "learning_rate": 0.0103031018335404, + "loss": 3.0949, + "mean_token_accuracy": 0.39859771728515625, + "num_tokens": 5094819175.0, + "step": 9966 + }, + { + "epoch": 2.6952406706327743, + "grad_norm": 2.734375, + "learning_rate": 0.010301530088368784, + "loss": 2.9167, + "mean_token_accuracy": 0.4186957776546478, + "num_tokens": 5095294794.0, + "step": 9967 + }, + { + "epoch": 2.6955110870740944, + "grad_norm": 2.859375, + "learning_rate": 0.010299958364628284, + "loss": 3.076, + "mean_token_accuracy": 0.39583927392959595, + "num_tokens": 5095819065.0, + "step": 9968 + }, + { + "epoch": 2.6957815035154136, + "grad_norm": 3.1875, + "learning_rate": 0.010298386662367121, + "loss": 3.0472, + "mean_token_accuracy": 0.3990169167518616, + "num_tokens": 5096343299.0, + "step": 9969 + }, + { + "epoch": 2.696051919956733, + "grad_norm": 2.859375, + "learning_rate": 0.010296814981633514, + "loss": 2.9825, + "mean_token_accuracy": 0.4260959029197693, + "num_tokens": 5096867451.0, + "step": 9970 + }, + { + "epoch": 2.696322336398053, + "grad_norm": 76.0, + "learning_rate": 0.010295243322475693, + "loss": 11.2542, + "mean_token_accuracy": 0.0008868930162861943, + "num_tokens": 5097391653.0, + "step": 9971 + }, + { + "epoch": 2.6965927528393725, + "grad_norm": 6.625, + "learning_rate": 0.010293671684941883, + "loss": 3.5215, + "mean_token_accuracy": 0.3418845534324646, + "num_tokens": 5097915918.0, + "step": 9972 + }, + { + "epoch": 2.696863169280692, + "grad_norm": 2.53125, + "learning_rate": 0.010292100069080297, + "loss": 3.1724, + "mean_token_accuracy": 0.39696377515792847, + "num_tokens": 5098440162.0, + "step": 9973 + }, + { + "epoch": 2.697133585722012, + "grad_norm": 3.421875, + "learning_rate": 0.010290528474939166, + "loss": 2.6371, + "mean_token_accuracy": 0.4416441321372986, + "num_tokens": 5098964398.0, + "step": 9974 + }, + { + "epoch": 2.6974040021633314, + "grad_norm": 2.609375, + "learning_rate": 0.010288956902566709, + "loss": 2.9509, + "mean_token_accuracy": 0.41987258195877075, + "num_tokens": 5099435832.0, + "step": 9975 + }, + { + "epoch": 2.697674418604651, + "grad_norm": 3.40625, + "learning_rate": 0.01028738535201114, + "loss": 2.9982, + "mean_token_accuracy": 0.38901790976524353, + "num_tokens": 5099959953.0, + "step": 9976 + }, + { + "epoch": 2.6979448350459707, + "grad_norm": 2.734375, + "learning_rate": 0.01028581382332069, + "loss": 3.0965, + "mean_token_accuracy": 0.4149268865585327, + "num_tokens": 5100445454.0, + "step": 9977 + }, + { + "epoch": 2.6982152514872904, + "grad_norm": 2.71875, + "learning_rate": 0.010284242316543568, + "loss": 2.9812, + "mean_token_accuracy": 0.3973594009876251, + "num_tokens": 5100969692.0, + "step": 9978 + }, + { + "epoch": 2.69848566792861, + "grad_norm": 2.453125, + "learning_rate": 0.010282670831728, + "loss": 3.0178, + "mean_token_accuracy": 0.42804184556007385, + "num_tokens": 5101493808.0, + "step": 9979 + }, + { + "epoch": 2.6987560843699296, + "grad_norm": 2.9375, + "learning_rate": 0.010281099368922202, + "loss": 2.9253, + "mean_token_accuracy": 0.418875515460968, + "num_tokens": 5101959456.0, + "step": 9980 + }, + { + "epoch": 2.6990265008112493, + "grad_norm": 2.5, + "learning_rate": 0.010279527928174383, + "loss": 3.1091, + "mean_token_accuracy": 0.3998929262161255, + "num_tokens": 5102483711.0, + "step": 9981 + }, + { + "epoch": 2.699296917252569, + "grad_norm": 3.5, + "learning_rate": 0.010277956509532771, + "loss": 3.0606, + "mean_token_accuracy": 0.41242581605911255, + "num_tokens": 5103007842.0, + "step": 9982 + }, + { + "epoch": 2.6995673336938886, + "grad_norm": 2.921875, + "learning_rate": 0.010276385113045577, + "loss": 3.1013, + "mean_token_accuracy": 0.4044795036315918, + "num_tokens": 5103531999.0, + "step": 9983 + }, + { + "epoch": 2.699837750135208, + "grad_norm": 3.578125, + "learning_rate": 0.010274813738761011, + "loss": 3.0188, + "mean_token_accuracy": 0.40449610352516174, + "num_tokens": 5104056218.0, + "step": 9984 + }, + { + "epoch": 2.700108166576528, + "grad_norm": 2.890625, + "learning_rate": 0.010273242386727298, + "loss": 3.0498, + "mean_token_accuracy": 0.4173273742198944, + "num_tokens": 5104580347.0, + "step": 9985 + }, + { + "epoch": 2.7003785830178475, + "grad_norm": 2.90625, + "learning_rate": 0.010271671056992641, + "loss": 2.8794, + "mean_token_accuracy": 0.41326749324798584, + "num_tokens": 5105104519.0, + "step": 9986 + }, + { + "epoch": 2.700648999459167, + "grad_norm": 38.25, + "learning_rate": 0.01027009974960526, + "loss": 2.9686, + "mean_token_accuracy": 0.44393807649612427, + "num_tokens": 5105628663.0, + "step": 9987 + }, + { + "epoch": 2.700919415900487, + "grad_norm": 6.0625, + "learning_rate": 0.010268528464613364, + "loss": 2.9729, + "mean_token_accuracy": 0.37956470251083374, + "num_tokens": 5106152776.0, + "step": 9988 + }, + { + "epoch": 2.7011898323418064, + "grad_norm": 2.234375, + "learning_rate": 0.010266957202065169, + "loss": 2.9325, + "mean_token_accuracy": 0.43431466817855835, + "num_tokens": 5106673312.0, + "step": 9989 + }, + { + "epoch": 2.701460248783126, + "grad_norm": 3.21875, + "learning_rate": 0.010265385962008877, + "loss": 3.1505, + "mean_token_accuracy": 0.39939188957214355, + "num_tokens": 5107197500.0, + "step": 9990 + }, + { + "epoch": 2.7017306652244457, + "grad_norm": 0.83984375, + "learning_rate": 0.01026381474449271, + "loss": 11.1107, + "mean_token_accuracy": 0.0, + "num_tokens": 5107721582.0, + "step": 9991 + }, + { + "epoch": 2.7020010816657654, + "grad_norm": 8.0625, + "learning_rate": 0.010262243549564872, + "loss": 3.3663, + "mean_token_accuracy": 0.38573959469795227, + "num_tokens": 5108245778.0, + "step": 9992 + }, + { + "epoch": 2.702271498107085, + "grad_norm": 2.0, + "learning_rate": 0.010260672377273563, + "loss": 3.1493, + "mean_token_accuracy": 0.39493051171302795, + "num_tokens": 5108718104.0, + "step": 9993 + }, + { + "epoch": 2.7025419145484046, + "grad_norm": 2.234375, + "learning_rate": 0.01025910122766701, + "loss": 3.0776, + "mean_token_accuracy": 0.407778799533844, + "num_tokens": 5109242257.0, + "step": 9994 + }, + { + "epoch": 2.7028123309897243, + "grad_norm": 3.0, + "learning_rate": 0.010257530100793404, + "loss": 2.9839, + "mean_token_accuracy": 0.3971669673919678, + "num_tokens": 5109766442.0, + "step": 9995 + }, + { + "epoch": 2.703082747431044, + "grad_norm": 2.625, + "learning_rate": 0.010255958996700959, + "loss": 3.2142, + "mean_token_accuracy": 0.4046783745288849, + "num_tokens": 5110290644.0, + "step": 9996 + }, + { + "epoch": 2.7033531638723636, + "grad_norm": 3.671875, + "learning_rate": 0.01025438791543788, + "loss": 3.2095, + "mean_token_accuracy": 0.39712095260620117, + "num_tokens": 5110803054.0, + "step": 9997 + }, + { + "epoch": 2.703623580313683, + "grad_norm": 2.484375, + "learning_rate": 0.01025281685705237, + "loss": 3.104, + "mean_token_accuracy": 0.40801602602005005, + "num_tokens": 5111327240.0, + "step": 9998 + }, + { + "epoch": 2.703893996755003, + "grad_norm": 2.71875, + "learning_rate": 0.01025124582159264, + "loss": 3.0204, + "mean_token_accuracy": 0.40588024258613586, + "num_tokens": 5111851413.0, + "step": 9999 + }, + { + "epoch": 2.7041644131963225, + "grad_norm": 2.546875, + "learning_rate": 0.010249674809106893, + "loss": 3.1379, + "mean_token_accuracy": 0.4166242480278015, + "num_tokens": 5112375594.0, + "step": 10000 + }, + { + "epoch": 2.704434829637642, + "grad_norm": 3.046875, + "learning_rate": 0.010248103819643322, + "loss": 3.3081, + "mean_token_accuracy": 0.39464235305786133, + "num_tokens": 5112895420.0, + "step": 10001 + }, + { + "epoch": 2.7047052460789613, + "grad_norm": 2.71875, + "learning_rate": 0.010246532853250143, + "loss": 3.1781, + "mean_token_accuracy": 0.40065181255340576, + "num_tokens": 5113363907.0, + "step": 10002 + }, + { + "epoch": 2.7049756625202814, + "grad_norm": 3.15625, + "learning_rate": 0.01024496190997555, + "loss": 3.0436, + "mean_token_accuracy": 0.3912031054496765, + "num_tokens": 5113888032.0, + "step": 10003 + }, + { + "epoch": 2.7052460789616006, + "grad_norm": 3.078125, + "learning_rate": 0.010243390989867743, + "loss": 3.1489, + "mean_token_accuracy": 0.40236157178878784, + "num_tokens": 5114412276.0, + "step": 10004 + }, + { + "epoch": 2.7055164954029207, + "grad_norm": 2.859375, + "learning_rate": 0.01024182009297493, + "loss": 2.8673, + "mean_token_accuracy": 0.40755873918533325, + "num_tokens": 5114936444.0, + "step": 10005 + }, + { + "epoch": 2.70578691184424, + "grad_norm": 3.140625, + "learning_rate": 0.010240249219345306, + "loss": 2.9472, + "mean_token_accuracy": 0.4097486138343811, + "num_tokens": 5115460650.0, + "step": 10006 + }, + { + "epoch": 2.70605732828556, + "grad_norm": 2.5625, + "learning_rate": 0.010238678369027065, + "loss": 3.2144, + "mean_token_accuracy": 0.4020596146583557, + "num_tokens": 5115948210.0, + "step": 10007 + }, + { + "epoch": 2.706327744726879, + "grad_norm": 3.375, + "learning_rate": 0.010237107542068415, + "loss": 3.0317, + "mean_token_accuracy": 0.40014058351516724, + "num_tokens": 5116472463.0, + "step": 10008 + }, + { + "epoch": 2.7065981611681993, + "grad_norm": 2.734375, + "learning_rate": 0.010235536738517551, + "loss": 2.9657, + "mean_token_accuracy": 0.4044606685638428, + "num_tokens": 5116996660.0, + "step": 10009 + }, + { + "epoch": 2.7068685776095185, + "grad_norm": 3.28125, + "learning_rate": 0.010233965958422664, + "loss": 3.2566, + "mean_token_accuracy": 0.39968612790107727, + "num_tokens": 5117438248.0, + "step": 10010 + }, + { + "epoch": 2.707138994050838, + "grad_norm": 11.875, + "learning_rate": 0.010232395201831954, + "loss": 10.7429, + "mean_token_accuracy": 3.26246845361311e-05, + "num_tokens": 5117962321.0, + "step": 10011 + }, + { + "epoch": 2.7074094104921578, + "grad_norm": 6.03125, + "learning_rate": 0.01023082446879362, + "loss": 3.3614, + "mean_token_accuracy": 0.37675371766090393, + "num_tokens": 5118486540.0, + "step": 10012 + }, + { + "epoch": 2.7076798269334774, + "grad_norm": 3.203125, + "learning_rate": 0.010229253759355848, + "loss": 3.2684, + "mean_token_accuracy": 0.39847302436828613, + "num_tokens": 5118971554.0, + "step": 10013 + }, + { + "epoch": 2.707950243374797, + "grad_norm": 3.21875, + "learning_rate": 0.010227683073566843, + "loss": 3.2741, + "mean_token_accuracy": 0.3829507827758789, + "num_tokens": 5119473256.0, + "step": 10014 + }, + { + "epoch": 2.7082206598161167, + "grad_norm": 3.21875, + "learning_rate": 0.01022611241147479, + "loss": 3.1035, + "mean_token_accuracy": 0.3648841381072998, + "num_tokens": 5119997525.0, + "step": 10015 + }, + { + "epoch": 2.7084910762574363, + "grad_norm": 2.609375, + "learning_rate": 0.010224541773127883, + "loss": 2.7454, + "mean_token_accuracy": 0.4501138925552368, + "num_tokens": 5120521786.0, + "step": 10016 + }, + { + "epoch": 2.708761492698756, + "grad_norm": 3.046875, + "learning_rate": 0.010222971158574314, + "loss": 3.2333, + "mean_token_accuracy": 0.38250914216041565, + "num_tokens": 5121045981.0, + "step": 10017 + }, + { + "epoch": 2.7090319091400756, + "grad_norm": 3.65625, + "learning_rate": 0.01022140056786228, + "loss": 3.3286, + "mean_token_accuracy": 0.37109482288360596, + "num_tokens": 5121570171.0, + "step": 10018 + }, + { + "epoch": 2.7093023255813953, + "grad_norm": 2.65625, + "learning_rate": 0.010219830001039962, + "loss": 3.1331, + "mean_token_accuracy": 0.4107387065887451, + "num_tokens": 5122094436.0, + "step": 10019 + }, + { + "epoch": 2.709572742022715, + "grad_norm": 2.828125, + "learning_rate": 0.010218259458155557, + "loss": 2.9554, + "mean_token_accuracy": 0.41970446705818176, + "num_tokens": 5122559397.0, + "step": 10020 + }, + { + "epoch": 2.7098431584640346, + "grad_norm": 2.859375, + "learning_rate": 0.010216688939257246, + "loss": 2.8891, + "mean_token_accuracy": 0.3871084451675415, + "num_tokens": 5123083550.0, + "step": 10021 + }, + { + "epoch": 2.710113574905354, + "grad_norm": 2.546875, + "learning_rate": 0.010215118444393226, + "loss": 3.3653, + "mean_token_accuracy": 0.38327547907829285, + "num_tokens": 5123607722.0, + "step": 10022 + }, + { + "epoch": 2.710383991346674, + "grad_norm": 3.875, + "learning_rate": 0.01021354797361168, + "loss": 3.1868, + "mean_token_accuracy": 0.4226488471031189, + "num_tokens": 5124068622.0, + "step": 10023 + }, + { + "epoch": 2.7106544077879935, + "grad_norm": 2.5625, + "learning_rate": 0.010211977526960792, + "loss": 3.1883, + "mean_token_accuracy": 0.4019398093223572, + "num_tokens": 5124571658.0, + "step": 10024 + }, + { + "epoch": 2.710924824229313, + "grad_norm": 3.078125, + "learning_rate": 0.010210407104488752, + "loss": 3.202, + "mean_token_accuracy": 0.39529526233673096, + "num_tokens": 5125095939.0, + "step": 10025 + }, + { + "epoch": 2.7111952406706328, + "grad_norm": 3.4375, + "learning_rate": 0.010208836706243746, + "loss": 3.1138, + "mean_token_accuracy": 0.4014628827571869, + "num_tokens": 5125620066.0, + "step": 10026 + }, + { + "epoch": 2.7114656571119524, + "grad_norm": 2.8125, + "learning_rate": 0.010207266332273954, + "loss": 3.0599, + "mean_token_accuracy": 0.40406402945518494, + "num_tokens": 5126144328.0, + "step": 10027 + }, + { + "epoch": 2.711736073553272, + "grad_norm": 3.0, + "learning_rate": 0.010205695982627565, + "loss": 3.0302, + "mean_token_accuracy": 0.417775958776474, + "num_tokens": 5126645057.0, + "step": 10028 + }, + { + "epoch": 2.7120064899945917, + "grad_norm": 3.640625, + "learning_rate": 0.01020412565735276, + "loss": 3.0823, + "mean_token_accuracy": 0.4193251132965088, + "num_tokens": 5127169255.0, + "step": 10029 + }, + { + "epoch": 2.7122769064359114, + "grad_norm": 3.40625, + "learning_rate": 0.010202555356497715, + "loss": 3.2067, + "mean_token_accuracy": 0.3825805187225342, + "num_tokens": 5127693407.0, + "step": 10030 + }, + { + "epoch": 2.712547322877231, + "grad_norm": 1.0078125, + "learning_rate": 0.010200985080110622, + "loss": 11.2125, + "mean_token_accuracy": 3.008495923495502e-06, + "num_tokens": 5128155682.0, + "step": 10031 + }, + { + "epoch": 2.7128177393185506, + "grad_norm": 7.59375, + "learning_rate": 0.010199414828239654, + "loss": 3.2952, + "mean_token_accuracy": 0.3768075108528137, + "num_tokens": 5128679864.0, + "step": 10032 + }, + { + "epoch": 2.7130881557598703, + "grad_norm": 1.8671875, + "learning_rate": 0.010197844600932994, + "loss": 3.0454, + "mean_token_accuracy": 0.4193269908428192, + "num_tokens": 5129204125.0, + "step": 10033 + }, + { + "epoch": 2.71335857220119, + "grad_norm": 2.96875, + "learning_rate": 0.010196274398238822, + "loss": 3.1321, + "mean_token_accuracy": 0.3999272584915161, + "num_tokens": 5129728266.0, + "step": 10034 + }, + { + "epoch": 2.7136289886425096, + "grad_norm": 3.28125, + "learning_rate": 0.010194704220205315, + "loss": 2.971, + "mean_token_accuracy": 0.4219483733177185, + "num_tokens": 5130213158.0, + "step": 10035 + }, + { + "epoch": 2.713899405083829, + "grad_norm": 3.515625, + "learning_rate": 0.010193134066880646, + "loss": 3.1227, + "mean_token_accuracy": 0.3585997223854065, + "num_tokens": 5130737384.0, + "step": 10036 + }, + { + "epoch": 2.714169821525149, + "grad_norm": 3.046875, + "learning_rate": 0.010191563938313003, + "loss": 2.7906, + "mean_token_accuracy": 0.4162646532058716, + "num_tokens": 5131253571.0, + "step": 10037 + }, + { + "epoch": 2.7144402379664685, + "grad_norm": 2.765625, + "learning_rate": 0.010189993834550553, + "loss": 3.1491, + "mean_token_accuracy": 0.4016115665435791, + "num_tokens": 5131767567.0, + "step": 10038 + }, + { + "epoch": 2.714710654407788, + "grad_norm": 3.484375, + "learning_rate": 0.010188423755641472, + "loss": 3.2008, + "mean_token_accuracy": 0.3929346799850464, + "num_tokens": 5132291466.0, + "step": 10039 + }, + { + "epoch": 2.714981070849108, + "grad_norm": 3.015625, + "learning_rate": 0.01018685370163394, + "loss": 3.2158, + "mean_token_accuracy": 0.4098619222640991, + "num_tokens": 5132812138.0, + "step": 10040 + }, + { + "epoch": 2.7152514872904274, + "grad_norm": 2.828125, + "learning_rate": 0.010185283672576128, + "loss": 2.8748, + "mean_token_accuracy": 0.41704386472702026, + "num_tokens": 5133336324.0, + "step": 10041 + }, + { + "epoch": 2.715521903731747, + "grad_norm": 2.390625, + "learning_rate": 0.010183713668516206, + "loss": 3.2283, + "mean_token_accuracy": 0.41084909439086914, + "num_tokens": 5133833476.0, + "step": 10042 + }, + { + "epoch": 2.7157923201730663, + "grad_norm": 3.828125, + "learning_rate": 0.010182143689502352, + "loss": 3.0173, + "mean_token_accuracy": 0.4064140319824219, + "num_tokens": 5134357693.0, + "step": 10043 + }, + { + "epoch": 2.7160627366143864, + "grad_norm": 3.140625, + "learning_rate": 0.010180573735582732, + "loss": 3.1816, + "mean_token_accuracy": 0.39876532554626465, + "num_tokens": 5134881971.0, + "step": 10044 + }, + { + "epoch": 2.7163331530557056, + "grad_norm": 3.40625, + "learning_rate": 0.010179003806805522, + "loss": 2.912, + "mean_token_accuracy": 0.4445304870605469, + "num_tokens": 5135343874.0, + "step": 10045 + }, + { + "epoch": 2.7166035694970256, + "grad_norm": 2.828125, + "learning_rate": 0.010177433903218888, + "loss": 3.1099, + "mean_token_accuracy": 0.40613633394241333, + "num_tokens": 5135868139.0, + "step": 10046 + }, + { + "epoch": 2.716873985938345, + "grad_norm": 3.390625, + "learning_rate": 0.010175864024870999, + "loss": 3.0856, + "mean_token_accuracy": 0.40344277024269104, + "num_tokens": 5136353413.0, + "step": 10047 + }, + { + "epoch": 2.717144402379665, + "grad_norm": 3.09375, + "learning_rate": 0.01017429417181003, + "loss": 3.3123, + "mean_token_accuracy": 0.3500792980194092, + "num_tokens": 5136877592.0, + "step": 10048 + }, + { + "epoch": 2.717414818820984, + "grad_norm": 2.578125, + "learning_rate": 0.010172724344084142, + "loss": 2.9751, + "mean_token_accuracy": 0.4012094736099243, + "num_tokens": 5137401845.0, + "step": 10049 + }, + { + "epoch": 2.717685235262304, + "grad_norm": 2.65625, + "learning_rate": 0.010171154541741502, + "loss": 2.8692, + "mean_token_accuracy": 0.4400619864463806, + "num_tokens": 5137883960.0, + "step": 10050 + }, + { + "epoch": 2.7179556517036234, + "grad_norm": 226.0, + "learning_rate": 0.010169584764830285, + "loss": 19.369, + "mean_token_accuracy": 1.5782328773639165e-05, + "num_tokens": 5138378312.0, + "step": 10051 + }, + { + "epoch": 2.718226068144943, + "grad_norm": 10.625, + "learning_rate": 0.010168015013398644, + "loss": 3.6403, + "mean_token_accuracy": 0.3508792817592621, + "num_tokens": 5138867092.0, + "step": 10052 + }, + { + "epoch": 2.7184964845862627, + "grad_norm": 2.453125, + "learning_rate": 0.010166445287494748, + "loss": 3.072, + "mean_token_accuracy": 0.39579081535339355, + "num_tokens": 5139391363.0, + "step": 10053 + }, + { + "epoch": 2.7187669010275823, + "grad_norm": 2.46875, + "learning_rate": 0.010164875587166763, + "loss": 3.2977, + "mean_token_accuracy": 0.3715153932571411, + "num_tokens": 5139915473.0, + "step": 10054 + }, + { + "epoch": 2.719037317468902, + "grad_norm": 2.875, + "learning_rate": 0.010163305912462854, + "loss": 3.1571, + "mean_token_accuracy": 0.3967372179031372, + "num_tokens": 5140439696.0, + "step": 10055 + }, + { + "epoch": 2.7193077339102216, + "grad_norm": 3.15625, + "learning_rate": 0.010161736263431176, + "loss": 3.2604, + "mean_token_accuracy": 0.3895338773727417, + "num_tokens": 5140963937.0, + "step": 10056 + }, + { + "epoch": 2.7195781503515413, + "grad_norm": 3.140625, + "learning_rate": 0.010160166640119896, + "loss": 3.2231, + "mean_token_accuracy": 0.38901785016059875, + "num_tokens": 5141411053.0, + "step": 10057 + }, + { + "epoch": 2.719848566792861, + "grad_norm": 2.484375, + "learning_rate": 0.010158597042577176, + "loss": 3.0968, + "mean_token_accuracy": 0.40087080001831055, + "num_tokens": 5141905948.0, + "step": 10058 + }, + { + "epoch": 2.7201189832341806, + "grad_norm": 2.578125, + "learning_rate": 0.010157027470851168, + "loss": 3.1487, + "mean_token_accuracy": 0.40012747049331665, + "num_tokens": 5142413886.0, + "step": 10059 + }, + { + "epoch": 2.7203893996755, + "grad_norm": 2.296875, + "learning_rate": 0.010155457924990039, + "loss": 2.9089, + "mean_token_accuracy": 0.4263484477996826, + "num_tokens": 5142886367.0, + "step": 10060 + }, + { + "epoch": 2.72065981611682, + "grad_norm": 6.1875, + "learning_rate": 0.010153888405041946, + "loss": 2.9566, + "mean_token_accuracy": 0.4402592182159424, + "num_tokens": 5143410586.0, + "step": 10061 + }, + { + "epoch": 2.7209302325581395, + "grad_norm": 1.6484375, + "learning_rate": 0.010152318911055041, + "loss": 3.0484, + "mean_token_accuracy": 0.41550835967063904, + "num_tokens": 5143906027.0, + "step": 10062 + }, + { + "epoch": 2.721200648999459, + "grad_norm": 2.625, + "learning_rate": 0.01015074944307749, + "loss": 3.0783, + "mean_token_accuracy": 0.42829015851020813, + "num_tokens": 5144366407.0, + "step": 10063 + }, + { + "epoch": 2.7214710654407788, + "grad_norm": 2.828125, + "learning_rate": 0.010149180001157441, + "loss": 2.969, + "mean_token_accuracy": 0.41171374917030334, + "num_tokens": 5144888470.0, + "step": 10064 + }, + { + "epoch": 2.7217414818820984, + "grad_norm": 2.984375, + "learning_rate": 0.01014761058534305, + "loss": 3.037, + "mean_token_accuracy": 0.40437471866607666, + "num_tokens": 5145412733.0, + "step": 10065 + }, + { + "epoch": 2.722011898323418, + "grad_norm": 2.734375, + "learning_rate": 0.010146041195682477, + "loss": 3.027, + "mean_token_accuracy": 0.4224652647972107, + "num_tokens": 5145936997.0, + "step": 10066 + }, + { + "epoch": 2.7222823147647377, + "grad_norm": 3.5625, + "learning_rate": 0.010144471832223865, + "loss": 3.3165, + "mean_token_accuracy": 0.3995560109615326, + "num_tokens": 5146437196.0, + "step": 10067 + }, + { + "epoch": 2.7225527312060573, + "grad_norm": 2.75, + "learning_rate": 0.010142902495015376, + "loss": 3.0392, + "mean_token_accuracy": 0.4139711856842041, + "num_tokens": 5146915893.0, + "step": 10068 + }, + { + "epoch": 2.722823147647377, + "grad_norm": 2.46875, + "learning_rate": 0.010141333184105162, + "loss": 3.0321, + "mean_token_accuracy": 0.3777652978897095, + "num_tokens": 5147440119.0, + "step": 10069 + }, + { + "epoch": 2.7230935640886966, + "grad_norm": 2.625, + "learning_rate": 0.010139763899541366, + "loss": 3.0446, + "mean_token_accuracy": 0.39377397298812866, + "num_tokens": 5147964381.0, + "step": 10070 + }, + { + "epoch": 2.7233639805300163, + "grad_norm": 61.5, + "learning_rate": 0.01013819464137215, + "loss": 12.2641, + "mean_token_accuracy": 0.006332422140985727, + "num_tokens": 5148488491.0, + "step": 10071 + }, + { + "epoch": 2.723634396971336, + "grad_norm": 6.625, + "learning_rate": 0.010136625409645652, + "loss": 3.5206, + "mean_token_accuracy": 0.3668292760848999, + "num_tokens": 5149012716.0, + "step": 10072 + }, + { + "epoch": 2.7239048134126556, + "grad_norm": 2.359375, + "learning_rate": 0.010135056204410025, + "loss": 3.0211, + "mean_token_accuracy": 0.40228334069252014, + "num_tokens": 5149536887.0, + "step": 10073 + }, + { + "epoch": 2.724175229853975, + "grad_norm": 2.875, + "learning_rate": 0.01013348702571342, + "loss": 3.3725, + "mean_token_accuracy": 0.3755002021789551, + "num_tokens": 5150061146.0, + "step": 10074 + }, + { + "epoch": 2.724445646295295, + "grad_norm": 3.03125, + "learning_rate": 0.01013191787360398, + "loss": 3.1571, + "mean_token_accuracy": 0.3664937913417816, + "num_tokens": 5150585298.0, + "step": 10075 + }, + { + "epoch": 2.7247160627366145, + "grad_norm": 2.4375, + "learning_rate": 0.010130348748129852, + "loss": 2.9954, + "mean_token_accuracy": 0.39363592863082886, + "num_tokens": 5151109460.0, + "step": 10076 + }, + { + "epoch": 2.724986479177934, + "grad_norm": 3.421875, + "learning_rate": 0.010128779649339184, + "loss": 3.3407, + "mean_token_accuracy": 0.39285364747047424, + "num_tokens": 5151633715.0, + "step": 10077 + }, + { + "epoch": 2.7252568956192538, + "grad_norm": 4.1875, + "learning_rate": 0.010127210577280117, + "loss": 3.0797, + "mean_token_accuracy": 0.4009644389152527, + "num_tokens": 5152157896.0, + "step": 10078 + }, + { + "epoch": 2.7255273120605734, + "grad_norm": 3.234375, + "learning_rate": 0.010125641532000795, + "loss": 3.1723, + "mean_token_accuracy": 0.40385568141937256, + "num_tokens": 5152676838.0, + "step": 10079 + }, + { + "epoch": 2.725797728501893, + "grad_norm": 2.84375, + "learning_rate": 0.010124072513549366, + "loss": 3.0298, + "mean_token_accuracy": 0.3942508101463318, + "num_tokens": 5153200973.0, + "step": 10080 + }, + { + "epoch": 2.7260681449432127, + "grad_norm": 2.78125, + "learning_rate": 0.010122503521973965, + "loss": 2.9957, + "mean_token_accuracy": 0.4260028004646301, + "num_tokens": 5153724396.0, + "step": 10081 + }, + { + "epoch": 2.7263385613845323, + "grad_norm": 3.765625, + "learning_rate": 0.010120934557322735, + "loss": 3.1819, + "mean_token_accuracy": 0.414705753326416, + "num_tokens": 5154248556.0, + "step": 10082 + }, + { + "epoch": 2.726608977825852, + "grad_norm": 3.0, + "learning_rate": 0.010119365619643819, + "loss": 3.0191, + "mean_token_accuracy": 0.4125475287437439, + "num_tokens": 5154732796.0, + "step": 10083 + }, + { + "epoch": 2.726879394267171, + "grad_norm": 2.828125, + "learning_rate": 0.01011779670898536, + "loss": 3.0794, + "mean_token_accuracy": 0.4274614453315735, + "num_tokens": 5155243547.0, + "step": 10084 + }, + { + "epoch": 2.7271498107084913, + "grad_norm": 3.015625, + "learning_rate": 0.010116227825395485, + "loss": 3.1222, + "mean_token_accuracy": 0.40690481662750244, + "num_tokens": 5155767806.0, + "step": 10085 + }, + { + "epoch": 2.7274202271498105, + "grad_norm": 3.0, + "learning_rate": 0.010114658968922345, + "loss": 3.225, + "mean_token_accuracy": 0.4182562828063965, + "num_tokens": 5156252799.0, + "step": 10086 + }, + { + "epoch": 2.7276906435911306, + "grad_norm": 3.4375, + "learning_rate": 0.01011309013961407, + "loss": 3.3065, + "mean_token_accuracy": 0.38095271587371826, + "num_tokens": 5156776988.0, + "step": 10087 + }, + { + "epoch": 2.7279610600324498, + "grad_norm": 2.515625, + "learning_rate": 0.010111521337518796, + "loss": 3.0043, + "mean_token_accuracy": 0.4203549921512604, + "num_tokens": 5157301268.0, + "step": 10088 + }, + { + "epoch": 2.72823147647377, + "grad_norm": 3.078125, + "learning_rate": 0.010109952562684663, + "loss": 2.9102, + "mean_token_accuracy": 0.43326234817504883, + "num_tokens": 5157782158.0, + "step": 10089 + }, + { + "epoch": 2.728501892915089, + "grad_norm": 3.75, + "learning_rate": 0.010108383815159798, + "loss": 3.024, + "mean_token_accuracy": 0.4083596169948578, + "num_tokens": 5158306348.0, + "step": 10090 + }, + { + "epoch": 2.728772309356409, + "grad_norm": 1.328125, + "learning_rate": 0.010106815094992345, + "loss": 11.0513, + "mean_token_accuracy": 2.289503754582256e-05, + "num_tokens": 5158830627.0, + "step": 10091 + }, + { + "epoch": 2.7290427257977283, + "grad_norm": 7.3125, + "learning_rate": 0.010105246402230433, + "loss": 3.2762, + "mean_token_accuracy": 0.3992394208908081, + "num_tokens": 5159349397.0, + "step": 10092 + }, + { + "epoch": 2.729313142239048, + "grad_norm": 2.28125, + "learning_rate": 0.010103677736922185, + "loss": 3.0115, + "mean_token_accuracy": 0.4239996671676636, + "num_tokens": 5159873635.0, + "step": 10093 + }, + { + "epoch": 2.7295835586803676, + "grad_norm": 3.25, + "learning_rate": 0.010102109099115747, + "loss": 3.3192, + "mean_token_accuracy": 0.3846094012260437, + "num_tokens": 5160397776.0, + "step": 10094 + }, + { + "epoch": 2.7298539751216873, + "grad_norm": 2.921875, + "learning_rate": 0.01010054048885924, + "loss": 3.1778, + "mean_token_accuracy": 0.37888532876968384, + "num_tokens": 5160921979.0, + "step": 10095 + }, + { + "epoch": 2.730124391563007, + "grad_norm": 3.0625, + "learning_rate": 0.010098971906200793, + "loss": 3.0985, + "mean_token_accuracy": 0.4065917730331421, + "num_tokens": 5161446260.0, + "step": 10096 + }, + { + "epoch": 2.7303948080043265, + "grad_norm": 3.703125, + "learning_rate": 0.010097403351188541, + "loss": 3.3807, + "mean_token_accuracy": 0.38640791177749634, + "num_tokens": 5161970408.0, + "step": 10097 + }, + { + "epoch": 2.730665224445646, + "grad_norm": 3.09375, + "learning_rate": 0.010095834823870609, + "loss": 3.1515, + "mean_token_accuracy": 0.3578541576862335, + "num_tokens": 5162494654.0, + "step": 10098 + }, + { + "epoch": 2.730935640886966, + "grad_norm": 3.515625, + "learning_rate": 0.010094266324295121, + "loss": 3.1456, + "mean_token_accuracy": 0.40093305706977844, + "num_tokens": 5163018729.0, + "step": 10099 + }, + { + "epoch": 2.7312060573282855, + "grad_norm": 3.203125, + "learning_rate": 0.01009269785251021, + "loss": 3.1553, + "mean_token_accuracy": 0.42592042684555054, + "num_tokens": 5163484856.0, + "step": 10100 + }, + { + "epoch": 2.731476473769605, + "grad_norm": 2.71875, + "learning_rate": 0.010091129408563994, + "loss": 3.2125, + "mean_token_accuracy": 0.40154045820236206, + "num_tokens": 5164009086.0, + "step": 10101 + }, + { + "epoch": 2.7317468902109248, + "grad_norm": 2.8125, + "learning_rate": 0.010089560992504598, + "loss": 3.0581, + "mean_token_accuracy": 0.4134131669998169, + "num_tokens": 5164533356.0, + "step": 10102 + }, + { + "epoch": 2.7320173066522444, + "grad_norm": 3.125, + "learning_rate": 0.010087992604380152, + "loss": 3.0455, + "mean_token_accuracy": 0.3911210894584656, + "num_tokens": 5165046683.0, + "step": 10103 + }, + { + "epoch": 2.732287723093564, + "grad_norm": 3.1875, + "learning_rate": 0.010086424244238775, + "loss": 3.0993, + "mean_token_accuracy": 0.4229355454444885, + "num_tokens": 5165569152.0, + "step": 10104 + }, + { + "epoch": 2.7325581395348837, + "grad_norm": 3.40625, + "learning_rate": 0.010084855912128585, + "loss": 2.9773, + "mean_token_accuracy": 0.4180414378643036, + "num_tokens": 5166093118.0, + "step": 10105 + }, + { + "epoch": 2.7328285559762033, + "grad_norm": 2.9375, + "learning_rate": 0.010083287608097711, + "loss": 3.1458, + "mean_token_accuracy": 0.4092770516872406, + "num_tokens": 5166583765.0, + "step": 10106 + }, + { + "epoch": 2.733098972417523, + "grad_norm": 3.34375, + "learning_rate": 0.01008171933219427, + "loss": 2.9437, + "mean_token_accuracy": 0.4028931260108948, + "num_tokens": 5167108042.0, + "step": 10107 + }, + { + "epoch": 2.7333693888588426, + "grad_norm": 3.171875, + "learning_rate": 0.010080151084466376, + "loss": 2.9597, + "mean_token_accuracy": 0.42557334899902344, + "num_tokens": 5167632213.0, + "step": 10108 + }, + { + "epoch": 2.7336398053001623, + "grad_norm": 2.796875, + "learning_rate": 0.010078582864962153, + "loss": 3.0782, + "mean_token_accuracy": 0.42226442694664, + "num_tokens": 5168149166.0, + "step": 10109 + }, + { + "epoch": 2.733910221741482, + "grad_norm": 3.5, + "learning_rate": 0.01007701467372972, + "loss": 3.1985, + "mean_token_accuracy": 0.39380359649658203, + "num_tokens": 5168673386.0, + "step": 10110 + }, + { + "epoch": 2.7341806381828015, + "grad_norm": 50.5, + "learning_rate": 0.010075446510817187, + "loss": 18.6335, + "mean_token_accuracy": 0.03498067334294319, + "num_tokens": 5169197439.0, + "step": 10111 + }, + { + "epoch": 2.734451054624121, + "grad_norm": 15.75, + "learning_rate": 0.010073878376272678, + "loss": 3.0841, + "mean_token_accuracy": 0.43780893087387085, + "num_tokens": 5169664851.0, + "step": 10112 + }, + { + "epoch": 2.734721471065441, + "grad_norm": 3.140625, + "learning_rate": 0.0100723102701443, + "loss": 3.3004, + "mean_token_accuracy": 0.3832883834838867, + "num_tokens": 5170189055.0, + "step": 10113 + }, + { + "epoch": 2.7349918875067605, + "grad_norm": 2.75, + "learning_rate": 0.010070742192480176, + "loss": 3.2003, + "mean_token_accuracy": 0.40344077348709106, + "num_tokens": 5170651604.0, + "step": 10114 + }, + { + "epoch": 2.73526230394808, + "grad_norm": 4.34375, + "learning_rate": 0.010069174143328412, + "loss": 3.2932, + "mean_token_accuracy": 0.3881693482398987, + "num_tokens": 5171171658.0, + "step": 10115 + }, + { + "epoch": 2.7355327203893998, + "grad_norm": 3.9375, + "learning_rate": 0.01006760612273712, + "loss": 3.1151, + "mean_token_accuracy": 0.40463384985923767, + "num_tokens": 5171695927.0, + "step": 10116 + }, + { + "epoch": 2.7358031368307194, + "grad_norm": 3.09375, + "learning_rate": 0.010066038130754415, + "loss": 3.0133, + "mean_token_accuracy": 0.4074156880378723, + "num_tokens": 5172220192.0, + "step": 10117 + }, + { + "epoch": 2.736073553272039, + "grad_norm": 2.625, + "learning_rate": 0.010064470167428412, + "loss": 2.9991, + "mean_token_accuracy": 0.414741575717926, + "num_tokens": 5172706436.0, + "step": 10118 + }, + { + "epoch": 2.7363439697133587, + "grad_norm": 3.046875, + "learning_rate": 0.01006290223280721, + "loss": 3.2439, + "mean_token_accuracy": 0.3840988278388977, + "num_tokens": 5173230562.0, + "step": 10119 + }, + { + "epoch": 2.7366143861546783, + "grad_norm": 2.609375, + "learning_rate": 0.010061334326938927, + "loss": 3.0682, + "mean_token_accuracy": 0.3915385603904724, + "num_tokens": 5173747243.0, + "step": 10120 + }, + { + "epoch": 2.736884802595998, + "grad_norm": 2.53125, + "learning_rate": 0.010059766449871665, + "loss": 3.0616, + "mean_token_accuracy": 0.4053409993648529, + "num_tokens": 5174271492.0, + "step": 10121 + }, + { + "epoch": 2.7371552190373176, + "grad_norm": 2.703125, + "learning_rate": 0.010058198601653532, + "loss": 2.9833, + "mean_token_accuracy": 0.4221208989620209, + "num_tokens": 5174772033.0, + "step": 10122 + }, + { + "epoch": 2.7374256354786373, + "grad_norm": 3.421875, + "learning_rate": 0.01005663078233264, + "loss": 3.1796, + "mean_token_accuracy": 0.3991064131259918, + "num_tokens": 5175296108.0, + "step": 10123 + }, + { + "epoch": 2.737696051919957, + "grad_norm": 3.859375, + "learning_rate": 0.010055062991957088, + "loss": 3.1631, + "mean_token_accuracy": 0.3821619749069214, + "num_tokens": 5175820281.0, + "step": 10124 + }, + { + "epoch": 2.737966468361276, + "grad_norm": 3.046875, + "learning_rate": 0.010053495230574978, + "loss": 3.1521, + "mean_token_accuracy": 0.40833452343940735, + "num_tokens": 5176303311.0, + "step": 10125 + }, + { + "epoch": 2.738236884802596, + "grad_norm": 3.296875, + "learning_rate": 0.010051927498234423, + "loss": 3.0535, + "mean_token_accuracy": 0.4317775070667267, + "num_tokens": 5176762225.0, + "step": 10126 + }, + { + "epoch": 2.7385073012439154, + "grad_norm": 3.21875, + "learning_rate": 0.010050359794983522, + "loss": 3.1744, + "mean_token_accuracy": 0.4030250310897827, + "num_tokens": 5177286414.0, + "step": 10127 + }, + { + "epoch": 2.7387777176852355, + "grad_norm": 2.984375, + "learning_rate": 0.010048792120870367, + "loss": 2.9461, + "mean_token_accuracy": 0.40659862756729126, + "num_tokens": 5177810601.0, + "step": 10128 + }, + { + "epoch": 2.7390481341265547, + "grad_norm": 2.96875, + "learning_rate": 0.010047224475943074, + "loss": 2.8333, + "mean_token_accuracy": 0.4209796190261841, + "num_tokens": 5178334800.0, + "step": 10129 + }, + { + "epoch": 2.7393185505678748, + "grad_norm": 3.25, + "learning_rate": 0.010045656860249734, + "loss": 3.2179, + "mean_token_accuracy": 0.39382749795913696, + "num_tokens": 5178859069.0, + "step": 10130 + }, + { + "epoch": 2.739588967009194, + "grad_norm": 48.25, + "learning_rate": 0.010044089273838448, + "loss": 18.1427, + "mean_token_accuracy": 0.0, + "num_tokens": 5179374201.0, + "step": 10131 + }, + { + "epoch": 2.739859383450514, + "grad_norm": 7.40625, + "learning_rate": 0.010042521716757314, + "loss": 3.4329, + "mean_token_accuracy": 0.3545107841491699, + "num_tokens": 5179898385.0, + "step": 10132 + }, + { + "epoch": 2.7401297998918333, + "grad_norm": 2.359375, + "learning_rate": 0.01004095418905443, + "loss": 3.2858, + "mean_token_accuracy": 0.38252824544906616, + "num_tokens": 5180422660.0, + "step": 10133 + }, + { + "epoch": 2.740400216333153, + "grad_norm": 2.859375, + "learning_rate": 0.010039386690777889, + "loss": 3.0619, + "mean_token_accuracy": 0.40867549180984497, + "num_tokens": 5180946878.0, + "step": 10134 + }, + { + "epoch": 2.7406706327744725, + "grad_norm": 2.78125, + "learning_rate": 0.010037819221975795, + "loss": 2.9978, + "mean_token_accuracy": 0.41303837299346924, + "num_tokens": 5181471108.0, + "step": 10135 + }, + { + "epoch": 2.740941049215792, + "grad_norm": 3.875, + "learning_rate": 0.01003625178269623, + "loss": 3.0501, + "mean_token_accuracy": 0.40567660331726074, + "num_tokens": 5181995336.0, + "step": 10136 + }, + { + "epoch": 2.741211465657112, + "grad_norm": 3.53125, + "learning_rate": 0.010034684372987301, + "loss": 2.8258, + "mean_token_accuracy": 0.4448569416999817, + "num_tokens": 5182519604.0, + "step": 10137 + }, + { + "epoch": 2.7414818820984315, + "grad_norm": 3.03125, + "learning_rate": 0.01003311699289709, + "loss": 3.1275, + "mean_token_accuracy": 0.4095320403575897, + "num_tokens": 5183018848.0, + "step": 10138 + }, + { + "epoch": 2.741752298539751, + "grad_norm": 2.4375, + "learning_rate": 0.010031549642473696, + "loss": 3.0385, + "mean_token_accuracy": 0.39841389656066895, + "num_tokens": 5183543018.0, + "step": 10139 + }, + { + "epoch": 2.7420227149810708, + "grad_norm": 2.546875, + "learning_rate": 0.010029982321765205, + "loss": 3.0912, + "mean_token_accuracy": 0.4252548813819885, + "num_tokens": 5183985994.0, + "step": 10140 + }, + { + "epoch": 2.7422931314223904, + "grad_norm": 3.0625, + "learning_rate": 0.01002841503081971, + "loss": 2.9065, + "mean_token_accuracy": 0.42450910806655884, + "num_tokens": 5184471599.0, + "step": 10141 + }, + { + "epoch": 2.74256354786371, + "grad_norm": 2.859375, + "learning_rate": 0.010026847769685297, + "loss": 3.0599, + "mean_token_accuracy": 0.40094491839408875, + "num_tokens": 5184963981.0, + "step": 10142 + }, + { + "epoch": 2.7428339643050297, + "grad_norm": 3.203125, + "learning_rate": 0.01002528053841006, + "loss": 2.9762, + "mean_token_accuracy": 0.3972405791282654, + "num_tokens": 5185488187.0, + "step": 10143 + }, + { + "epoch": 2.7431043807463493, + "grad_norm": 2.578125, + "learning_rate": 0.01002371333704208, + "loss": 3.1209, + "mean_token_accuracy": 0.3994075655937195, + "num_tokens": 5185988297.0, + "step": 10144 + }, + { + "epoch": 2.743374797187669, + "grad_norm": 3.578125, + "learning_rate": 0.010022146165629447, + "loss": 3.1281, + "mean_token_accuracy": 0.3840116858482361, + "num_tokens": 5186512458.0, + "step": 10145 + }, + { + "epoch": 2.7436452136289886, + "grad_norm": 2.46875, + "learning_rate": 0.010020579024220244, + "loss": 2.9478, + "mean_token_accuracy": 0.4031009078025818, + "num_tokens": 5187002771.0, + "step": 10146 + }, + { + "epoch": 2.7439156300703083, + "grad_norm": 2.546875, + "learning_rate": 0.01001901191286256, + "loss": 3.2753, + "mean_token_accuracy": 0.39528387784957886, + "num_tokens": 5187486100.0, + "step": 10147 + }, + { + "epoch": 2.744186046511628, + "grad_norm": 4.1875, + "learning_rate": 0.01001744483160447, + "loss": 3.1854, + "mean_token_accuracy": 0.4019015431404114, + "num_tokens": 5188010346.0, + "step": 10148 + }, + { + "epoch": 2.7444564629529475, + "grad_norm": 2.625, + "learning_rate": 0.010015877780494066, + "loss": 3.0438, + "mean_token_accuracy": 0.3901465833187103, + "num_tokens": 5188534545.0, + "step": 10149 + }, + { + "epoch": 2.744726879394267, + "grad_norm": 2.515625, + "learning_rate": 0.010014310759579427, + "loss": 3.1371, + "mean_token_accuracy": 0.4029122591018677, + "num_tokens": 5189052687.0, + "step": 10150 + }, + { + "epoch": 2.744997295835587, + "grad_norm": 36.5, + "learning_rate": 0.01001274376890863, + "loss": 11.6003, + "mean_token_accuracy": 0.013949130661785603, + "num_tokens": 5189576929.0, + "step": 10151 + }, + { + "epoch": 2.7452677122769065, + "grad_norm": 5.78125, + "learning_rate": 0.010011176808529755, + "loss": 3.3134, + "mean_token_accuracy": 0.3868025839328766, + "num_tokens": 5190054460.0, + "step": 10152 + }, + { + "epoch": 2.745538128718226, + "grad_norm": 2.421875, + "learning_rate": 0.010009609878490889, + "loss": 3.2453, + "mean_token_accuracy": 0.40104979276657104, + "num_tokens": 5190560828.0, + "step": 10153 + }, + { + "epoch": 2.7458085451595458, + "grad_norm": 3.21875, + "learning_rate": 0.0100080429788401, + "loss": 2.9924, + "mean_token_accuracy": 0.4145967364311218, + "num_tokens": 5191059889.0, + "step": 10154 + }, + { + "epoch": 2.7460789616008654, + "grad_norm": 2.90625, + "learning_rate": 0.010006476109625474, + "loss": 3.1788, + "mean_token_accuracy": 0.3931387960910797, + "num_tokens": 5191584031.0, + "step": 10155 + }, + { + "epoch": 2.746349378042185, + "grad_norm": 3.5, + "learning_rate": 0.010004909270895082, + "loss": 3.287, + "mean_token_accuracy": 0.3954964876174927, + "num_tokens": 5192102001.0, + "step": 10156 + }, + { + "epoch": 2.7466197944835047, + "grad_norm": 3.0625, + "learning_rate": 0.010003342462696994, + "loss": 3.1056, + "mean_token_accuracy": 0.41851741075515747, + "num_tokens": 5192568309.0, + "step": 10157 + }, + { + "epoch": 2.7468902109248243, + "grad_norm": 7.53125, + "learning_rate": 0.010001775685079298, + "loss": 2.7669, + "mean_token_accuracy": 0.43939465284347534, + "num_tokens": 5193092586.0, + "step": 10158 + }, + { + "epoch": 2.747160627366144, + "grad_norm": 2.328125, + "learning_rate": 0.010000208938090054, + "loss": 3.29, + "mean_token_accuracy": 0.4131769835948944, + "num_tokens": 5193569813.0, + "step": 10159 + }, + { + "epoch": 2.7474310438074636, + "grad_norm": 2.65625, + "learning_rate": 0.009998642221777338, + "loss": 3.1853, + "mean_token_accuracy": 0.3928750455379486, + "num_tokens": 5194032640.0, + "step": 10160 + }, + { + "epoch": 2.7477014602487833, + "grad_norm": 2.59375, + "learning_rate": 0.009997075536189231, + "loss": 2.8986, + "mean_token_accuracy": 0.4170682430267334, + "num_tokens": 5194525662.0, + "step": 10161 + }, + { + "epoch": 2.747971876690103, + "grad_norm": 3.734375, + "learning_rate": 0.00999550888137379, + "loss": 2.9767, + "mean_token_accuracy": 0.4176369309425354, + "num_tokens": 5195049829.0, + "step": 10162 + }, + { + "epoch": 2.7482422931314225, + "grad_norm": 3.375, + "learning_rate": 0.009993942257379094, + "loss": 3.079, + "mean_token_accuracy": 0.4119196832180023, + "num_tokens": 5195519746.0, + "step": 10163 + }, + { + "epoch": 2.748512709572742, + "grad_norm": 3.28125, + "learning_rate": 0.009992375664253208, + "loss": 3.0397, + "mean_token_accuracy": 0.41906529664993286, + "num_tokens": 5196026805.0, + "step": 10164 + }, + { + "epoch": 2.748783126014062, + "grad_norm": 3.25, + "learning_rate": 0.009990809102044196, + "loss": 3.1907, + "mean_token_accuracy": 0.38190576434135437, + "num_tokens": 5196550972.0, + "step": 10165 + }, + { + "epoch": 2.749053542455381, + "grad_norm": 2.984375, + "learning_rate": 0.009989242570800135, + "loss": 3.2599, + "mean_token_accuracy": 0.37297433614730835, + "num_tokens": 5197075113.0, + "step": 10166 + }, + { + "epoch": 2.749323958896701, + "grad_norm": 2.484375, + "learning_rate": 0.009987676070569081, + "loss": 3.1739, + "mean_token_accuracy": 0.4079963266849518, + "num_tokens": 5197586713.0, + "step": 10167 + }, + { + "epoch": 2.7495943753380203, + "grad_norm": 2.875, + "learning_rate": 0.009986109601399099, + "loss": 3.002, + "mean_token_accuracy": 0.3912777304649353, + "num_tokens": 5198110906.0, + "step": 10168 + }, + { + "epoch": 2.7498647917793404, + "grad_norm": 2.78125, + "learning_rate": 0.00998454316333826, + "loss": 3.1148, + "mean_token_accuracy": 0.4010344445705414, + "num_tokens": 5198635032.0, + "step": 10169 + }, + { + "epoch": 2.7501352082206596, + "grad_norm": 3.0, + "learning_rate": 0.009982976756434626, + "loss": 2.9119, + "mean_token_accuracy": 0.41330206394195557, + "num_tokens": 5199117015.0, + "step": 10170 + }, + { + "epoch": 2.7504056246619797, + "grad_norm": 96.5, + "learning_rate": 0.009981410380736246, + "loss": 11.2399, + "mean_token_accuracy": 0.03822324424982071, + "num_tokens": 5199641268.0, + "step": 10171 + }, + { + "epoch": 2.750676041103299, + "grad_norm": 5.75, + "learning_rate": 0.0099798440362912, + "loss": 3.2317, + "mean_token_accuracy": 0.3847832679748535, + "num_tokens": 5200165397.0, + "step": 10172 + }, + { + "epoch": 2.750946457544619, + "grad_norm": 2.109375, + "learning_rate": 0.009978277723147537, + "loss": 3.1587, + "mean_token_accuracy": 0.3904581069946289, + "num_tokens": 5200689662.0, + "step": 10173 + }, + { + "epoch": 2.751216873985938, + "grad_norm": 2.484375, + "learning_rate": 0.009976711441353316, + "loss": 3.2641, + "mean_token_accuracy": 0.42221394181251526, + "num_tokens": 5201150106.0, + "step": 10174 + }, + { + "epoch": 2.751487290427258, + "grad_norm": 3.40625, + "learning_rate": 0.009975145190956599, + "loss": 3.1054, + "mean_token_accuracy": 0.4045228958129883, + "num_tokens": 5201674382.0, + "step": 10175 + }, + { + "epoch": 2.7517577068685775, + "grad_norm": 2.46875, + "learning_rate": 0.009973578972005442, + "loss": 3.0191, + "mean_token_accuracy": 0.4162331521511078, + "num_tokens": 5202196570.0, + "step": 10176 + }, + { + "epoch": 2.752028123309897, + "grad_norm": 2.71875, + "learning_rate": 0.009972012784547898, + "loss": 3.2947, + "mean_token_accuracy": 0.41280651092529297, + "num_tokens": 5202710365.0, + "step": 10177 + }, + { + "epoch": 2.7522985397512167, + "grad_norm": 2.71875, + "learning_rate": 0.009970446628632028, + "loss": 3.0787, + "mean_token_accuracy": 0.4177495837211609, + "num_tokens": 5203178904.0, + "step": 10178 + }, + { + "epoch": 2.7525689561925364, + "grad_norm": 2.75, + "learning_rate": 0.009968880504305881, + "loss": 3.1118, + "mean_token_accuracy": 0.40847522020339966, + "num_tokens": 5203702997.0, + "step": 10179 + }, + { + "epoch": 2.752839372633856, + "grad_norm": 3.8125, + "learning_rate": 0.009967314411617514, + "loss": 3.0581, + "mean_token_accuracy": 0.40985918045043945, + "num_tokens": 5204203386.0, + "step": 10180 + }, + { + "epoch": 2.7531097890751757, + "grad_norm": 3.5625, + "learning_rate": 0.009965748350614976, + "loss": 3.2397, + "mean_token_accuracy": 0.3996436893939972, + "num_tokens": 5204727544.0, + "step": 10181 + }, + { + "epoch": 2.7533802055164953, + "grad_norm": 3.0625, + "learning_rate": 0.009964182321346323, + "loss": 3.0705, + "mean_token_accuracy": 0.3978973627090454, + "num_tokens": 5205226899.0, + "step": 10182 + }, + { + "epoch": 2.753650621957815, + "grad_norm": 2.3125, + "learning_rate": 0.009962616323859598, + "loss": 2.7682, + "mean_token_accuracy": 0.4367847740650177, + "num_tokens": 5205742332.0, + "step": 10183 + }, + { + "epoch": 2.7539210383991346, + "grad_norm": 3.0625, + "learning_rate": 0.009961050358202859, + "loss": 3.0001, + "mean_token_accuracy": 0.39831632375717163, + "num_tokens": 5206266356.0, + "step": 10184 + }, + { + "epoch": 2.7541914548404542, + "grad_norm": 2.421875, + "learning_rate": 0.009959484424424146, + "loss": 3.0392, + "mean_token_accuracy": 0.41450196504592896, + "num_tokens": 5206790481.0, + "step": 10185 + }, + { + "epoch": 2.754461871281774, + "grad_norm": 2.59375, + "learning_rate": 0.009957918522571517, + "loss": 2.8095, + "mean_token_accuracy": 0.4284304976463318, + "num_tokens": 5207294485.0, + "step": 10186 + }, + { + "epoch": 2.7547322877230935, + "grad_norm": 2.296875, + "learning_rate": 0.009956352652693009, + "loss": 3.0103, + "mean_token_accuracy": 0.40797409415245056, + "num_tokens": 5207799164.0, + "step": 10187 + }, + { + "epoch": 2.755002704164413, + "grad_norm": 2.4375, + "learning_rate": 0.009954786814836669, + "loss": 3.0906, + "mean_token_accuracy": 0.4071537256240845, + "num_tokens": 5208271556.0, + "step": 10188 + }, + { + "epoch": 2.755273120605733, + "grad_norm": 2.984375, + "learning_rate": 0.009953221009050543, + "loss": 2.9049, + "mean_token_accuracy": 0.4177720248699188, + "num_tokens": 5208795730.0, + "step": 10189 + }, + { + "epoch": 2.7555435370470525, + "grad_norm": 2.9375, + "learning_rate": 0.009951655235382679, + "loss": 3.1278, + "mean_token_accuracy": 0.4105064868927002, + "num_tokens": 5209319972.0, + "step": 10190 + }, + { + "epoch": 2.755813953488372, + "grad_norm": 33.25, + "learning_rate": 0.009950089493881112, + "loss": 10.7041, + "mean_token_accuracy": 0.011781531386077404, + "num_tokens": 5209844119.0, + "step": 10191 + }, + { + "epoch": 2.7560843699296917, + "grad_norm": 6.8125, + "learning_rate": 0.00994852378459389, + "loss": 3.2766, + "mean_token_accuracy": 0.35007229447364807, + "num_tokens": 5210368356.0, + "step": 10192 + }, + { + "epoch": 2.7563547863710114, + "grad_norm": 2.328125, + "learning_rate": 0.009946958107569051, + "loss": 3.1731, + "mean_token_accuracy": 0.3748520612716675, + "num_tokens": 5210892601.0, + "step": 10193 + }, + { + "epoch": 2.756625202812331, + "grad_norm": 3.390625, + "learning_rate": 0.009945392462854627, + "loss": 2.9022, + "mean_token_accuracy": 0.46921306848526, + "num_tokens": 5211415809.0, + "step": 10194 + }, + { + "epoch": 2.7568956192536507, + "grad_norm": 3.578125, + "learning_rate": 0.009943826850498669, + "loss": 3.2413, + "mean_token_accuracy": 0.3806297183036804, + "num_tokens": 5211939962.0, + "step": 10195 + }, + { + "epoch": 2.7571660356949703, + "grad_norm": 3.4375, + "learning_rate": 0.00994226127054921, + "loss": 3.1319, + "mean_token_accuracy": 0.3928765654563904, + "num_tokens": 5212464238.0, + "step": 10196 + }, + { + "epoch": 2.75743645213629, + "grad_norm": 4.71875, + "learning_rate": 0.009940695723054283, + "loss": 2.8865, + "mean_token_accuracy": 0.4097507894039154, + "num_tokens": 5212988373.0, + "step": 10197 + }, + { + "epoch": 2.7577068685776096, + "grad_norm": 3.078125, + "learning_rate": 0.009939130208061931, + "loss": 3.0791, + "mean_token_accuracy": 0.39234161376953125, + "num_tokens": 5213497516.0, + "step": 10198 + }, + { + "epoch": 2.7579772850189292, + "grad_norm": 3.3125, + "learning_rate": 0.009937564725620181, + "loss": 3.1699, + "mean_token_accuracy": 0.39828312397003174, + "num_tokens": 5214021790.0, + "step": 10199 + }, + { + "epoch": 2.758247701460249, + "grad_norm": 2.640625, + "learning_rate": 0.009935999275777067, + "loss": 3.1132, + "mean_token_accuracy": 0.42042437195777893, + "num_tokens": 5214506727.0, + "step": 10200 + }, + { + "epoch": 2.7585181179015685, + "grad_norm": 3.640625, + "learning_rate": 0.009934433858580628, + "loss": 3.1328, + "mean_token_accuracy": 0.39645835757255554, + "num_tokens": 5215030934.0, + "step": 10201 + }, + { + "epoch": 2.758788534342888, + "grad_norm": 2.6875, + "learning_rate": 0.009932868474078892, + "loss": 3.2219, + "mean_token_accuracy": 0.4026443660259247, + "num_tokens": 5215555192.0, + "step": 10202 + }, + { + "epoch": 2.759058950784208, + "grad_norm": 3.28125, + "learning_rate": 0.009931303122319884, + "loss": 3.11, + "mean_token_accuracy": 0.41368719935417175, + "num_tokens": 5216079357.0, + "step": 10203 + }, + { + "epoch": 2.7593293672255275, + "grad_norm": 2.625, + "learning_rate": 0.009929737803351647, + "loss": 3.116, + "mean_token_accuracy": 0.4138783812522888, + "num_tokens": 5216603636.0, + "step": 10204 + }, + { + "epoch": 2.759599783666847, + "grad_norm": 3.0625, + "learning_rate": 0.009928172517222198, + "loss": 3.1869, + "mean_token_accuracy": 0.4042302966117859, + "num_tokens": 5217102837.0, + "step": 10205 + }, + { + "epoch": 2.7598702001081667, + "grad_norm": 2.75, + "learning_rate": 0.009926607263979566, + "loss": 3.2228, + "mean_token_accuracy": 0.3788004219532013, + "num_tokens": 5217627087.0, + "step": 10206 + }, + { + "epoch": 2.760140616549486, + "grad_norm": 2.46875, + "learning_rate": 0.009925042043671782, + "loss": 3.0062, + "mean_token_accuracy": 0.4081502854824066, + "num_tokens": 5218151367.0, + "step": 10207 + }, + { + "epoch": 2.760411032990806, + "grad_norm": 2.359375, + "learning_rate": 0.009923476856346867, + "loss": 3.1256, + "mean_token_accuracy": 0.4096381962299347, + "num_tokens": 5218675614.0, + "step": 10208 + }, + { + "epoch": 2.7606814494321252, + "grad_norm": 2.703125, + "learning_rate": 0.009921911702052852, + "loss": 3.1143, + "mean_token_accuracy": 0.4137701094150543, + "num_tokens": 5219178023.0, + "step": 10209 + }, + { + "epoch": 2.7609518658734453, + "grad_norm": 2.890625, + "learning_rate": 0.009920346580837753, + "loss": 3.0852, + "mean_token_accuracy": 0.40697968006134033, + "num_tokens": 5219702165.0, + "step": 10210 + }, + { + "epoch": 2.7612222823147645, + "grad_norm": 37.5, + "learning_rate": 0.009918781492749596, + "loss": 9.6215, + "mean_token_accuracy": 0.009683882817626, + "num_tokens": 5220226409.0, + "step": 10211 + }, + { + "epoch": 2.7614926987560846, + "grad_norm": 7.1875, + "learning_rate": 0.009917216437836406, + "loss": 3.4495, + "mean_token_accuracy": 0.36591947078704834, + "num_tokens": 5220687257.0, + "step": 10212 + }, + { + "epoch": 2.761763115197404, + "grad_norm": 2.125, + "learning_rate": 0.009915651416146198, + "loss": 2.8698, + "mean_token_accuracy": 0.41576671600341797, + "num_tokens": 5221207116.0, + "step": 10213 + }, + { + "epoch": 2.762033531638724, + "grad_norm": 3.828125, + "learning_rate": 0.009914086427726992, + "loss": 3.3508, + "mean_token_accuracy": 0.37974753975868225, + "num_tokens": 5221731262.0, + "step": 10214 + }, + { + "epoch": 2.762303948080043, + "grad_norm": 3.734375, + "learning_rate": 0.00991252147262681, + "loss": 3.1856, + "mean_token_accuracy": 0.3843531906604767, + "num_tokens": 5222199896.0, + "step": 10215 + }, + { + "epoch": 2.7625743645213627, + "grad_norm": 3.40625, + "learning_rate": 0.009910956550893666, + "loss": 3.0705, + "mean_token_accuracy": 0.4215218424797058, + "num_tokens": 5222711340.0, + "step": 10216 + }, + { + "epoch": 2.7628447809626824, + "grad_norm": 3.546875, + "learning_rate": 0.009909391662575576, + "loss": 3.07, + "mean_token_accuracy": 0.4014590382575989, + "num_tokens": 5223235502.0, + "step": 10217 + }, + { + "epoch": 2.763115197404002, + "grad_norm": 2.625, + "learning_rate": 0.009907826807720559, + "loss": 3.1337, + "mean_token_accuracy": 0.4133825898170471, + "num_tokens": 5223759786.0, + "step": 10218 + }, + { + "epoch": 2.7633856138453217, + "grad_norm": 2.859375, + "learning_rate": 0.009906261986376631, + "loss": 2.9573, + "mean_token_accuracy": 0.4340379238128662, + "num_tokens": 5224264207.0, + "step": 10219 + }, + { + "epoch": 2.7636560302866413, + "grad_norm": 3.0625, + "learning_rate": 0.009904697198591795, + "loss": 3.1097, + "mean_token_accuracy": 0.41998937726020813, + "num_tokens": 5224762333.0, + "step": 10220 + }, + { + "epoch": 2.763926446727961, + "grad_norm": 2.953125, + "learning_rate": 0.009903132444414075, + "loss": 3.0518, + "mean_token_accuracy": 0.4011783003807068, + "num_tokens": 5225286477.0, + "step": 10221 + }, + { + "epoch": 2.7641968631692806, + "grad_norm": 2.734375, + "learning_rate": 0.009901567723891476, + "loss": 3.0094, + "mean_token_accuracy": 0.4023611545562744, + "num_tokens": 5225810715.0, + "step": 10222 + }, + { + "epoch": 2.7644672796106002, + "grad_norm": 4.9375, + "learning_rate": 0.009900003037072008, + "loss": 2.976, + "mean_token_accuracy": 0.4287753701210022, + "num_tokens": 5226268180.0, + "step": 10223 + }, + { + "epoch": 2.76473769605192, + "grad_norm": 1.890625, + "learning_rate": 0.00989843838400368, + "loss": 3.05, + "mean_token_accuracy": 0.41816186904907227, + "num_tokens": 5226792457.0, + "step": 10224 + }, + { + "epoch": 2.7650081124932395, + "grad_norm": 3.546875, + "learning_rate": 0.009896873764734509, + "loss": 2.8834, + "mean_token_accuracy": 0.4528743028640747, + "num_tokens": 5227316726.0, + "step": 10225 + }, + { + "epoch": 2.765278528934559, + "grad_norm": 3.359375, + "learning_rate": 0.009895309179312488, + "loss": 3.1569, + "mean_token_accuracy": 0.39712047576904297, + "num_tokens": 5227810757.0, + "step": 10226 + }, + { + "epoch": 2.765548945375879, + "grad_norm": 3.234375, + "learning_rate": 0.009893744627785635, + "loss": 3.3112, + "mean_token_accuracy": 0.39644330739974976, + "num_tokens": 5228335028.0, + "step": 10227 + }, + { + "epoch": 2.7658193618171985, + "grad_norm": 3.03125, + "learning_rate": 0.009892180110201947, + "loss": 3.0451, + "mean_token_accuracy": 0.388563871383667, + "num_tokens": 5228859306.0, + "step": 10228 + }, + { + "epoch": 2.766089778258518, + "grad_norm": 2.625, + "learning_rate": 0.00989061562660943, + "loss": 3.007, + "mean_token_accuracy": 0.4195069670677185, + "num_tokens": 5229380057.0, + "step": 10229 + }, + { + "epoch": 2.7663601946998377, + "grad_norm": 3.28125, + "learning_rate": 0.00988905117705609, + "loss": 3.1244, + "mean_token_accuracy": 0.39475172758102417, + "num_tokens": 5229904178.0, + "step": 10230 + }, + { + "epoch": 2.7666306111411574, + "grad_norm": 49.25, + "learning_rate": 0.009887486761589926, + "loss": 12.3395, + "mean_token_accuracy": 0.014524326659739017, + "num_tokens": 5230428432.0, + "step": 10231 + }, + { + "epoch": 2.766901027582477, + "grad_norm": 7.59375, + "learning_rate": 0.00988592238025894, + "loss": 3.3463, + "mean_token_accuracy": 0.3457275927066803, + "num_tokens": 5230952509.0, + "step": 10232 + }, + { + "epoch": 2.7671714440237967, + "grad_norm": 1.7734375, + "learning_rate": 0.009884358033111136, + "loss": 3.0189, + "mean_token_accuracy": 0.4090774655342102, + "num_tokens": 5231476794.0, + "step": 10233 + }, + { + "epoch": 2.7674418604651163, + "grad_norm": 1.8515625, + "learning_rate": 0.0098827937201945, + "loss": 3.18, + "mean_token_accuracy": 0.4149347245693207, + "num_tokens": 5231975809.0, + "step": 10234 + }, + { + "epoch": 2.767712276906436, + "grad_norm": 3.046875, + "learning_rate": 0.009881229441557044, + "loss": 3.1737, + "mean_token_accuracy": 0.38028883934020996, + "num_tokens": 5232500040.0, + "step": 10235 + }, + { + "epoch": 2.7679826933477556, + "grad_norm": 2.46875, + "learning_rate": 0.009879665197246762, + "loss": 2.8024, + "mean_token_accuracy": 0.4124782681465149, + "num_tokens": 5233024192.0, + "step": 10236 + }, + { + "epoch": 2.7682531097890752, + "grad_norm": 2.71875, + "learning_rate": 0.00987810098731164, + "loss": 3.1557, + "mean_token_accuracy": 0.3883178234100342, + "num_tokens": 5233548368.0, + "step": 10237 + }, + { + "epoch": 2.768523526230395, + "grad_norm": 2.546875, + "learning_rate": 0.00987653681179968, + "loss": 3.0465, + "mean_token_accuracy": 0.40870213508605957, + "num_tokens": 5234072649.0, + "step": 10238 + }, + { + "epoch": 2.7687939426717145, + "grad_norm": 2.859375, + "learning_rate": 0.009874972670758876, + "loss": 2.9512, + "mean_token_accuracy": 0.41005074977874756, + "num_tokens": 5234595900.0, + "step": 10239 + }, + { + "epoch": 2.769064359113034, + "grad_norm": 2.484375, + "learning_rate": 0.009873408564237218, + "loss": 3.1374, + "mean_token_accuracy": 0.40994036197662354, + "num_tokens": 5235114681.0, + "step": 10240 + }, + { + "epoch": 2.769334775554354, + "grad_norm": 4.15625, + "learning_rate": 0.0098718444922827, + "loss": 3.1938, + "mean_token_accuracy": 0.3874128460884094, + "num_tokens": 5235638964.0, + "step": 10241 + }, + { + "epoch": 2.7696051919956735, + "grad_norm": 1.9765625, + "learning_rate": 0.009870280454943313, + "loss": 2.9673, + "mean_token_accuracy": 0.431100070476532, + "num_tokens": 5236163126.0, + "step": 10242 + }, + { + "epoch": 2.769875608436993, + "grad_norm": 3.953125, + "learning_rate": 0.009868716452267038, + "loss": 3.1354, + "mean_token_accuracy": 0.40352851152420044, + "num_tokens": 5236653222.0, + "step": 10243 + }, + { + "epoch": 2.7701460248783127, + "grad_norm": 2.671875, + "learning_rate": 0.009867152484301872, + "loss": 3.1225, + "mean_token_accuracy": 0.4093848764896393, + "num_tokens": 5237177397.0, + "step": 10244 + }, + { + "epoch": 2.7704164413196324, + "grad_norm": 3.515625, + "learning_rate": 0.009865588551095799, + "loss": 3.112, + "mean_token_accuracy": 0.3765328526496887, + "num_tokens": 5237701676.0, + "step": 10245 + }, + { + "epoch": 2.770686857760952, + "grad_norm": 2.375, + "learning_rate": 0.009864024652696802, + "loss": 2.8826, + "mean_token_accuracy": 0.4209447503089905, + "num_tokens": 5238225961.0, + "step": 10246 + }, + { + "epoch": 2.7709572742022717, + "grad_norm": 3.21875, + "learning_rate": 0.009862460789152877, + "loss": 3.0578, + "mean_token_accuracy": 0.3898022472858429, + "num_tokens": 5238750056.0, + "step": 10247 + }, + { + "epoch": 2.771227690643591, + "grad_norm": 3.0625, + "learning_rate": 0.009860896960511996, + "loss": 2.9567, + "mean_token_accuracy": 0.4206257462501526, + "num_tokens": 5239214185.0, + "step": 10248 + }, + { + "epoch": 2.771498107084911, + "grad_norm": 3.046875, + "learning_rate": 0.009859333166822144, + "loss": 3.122, + "mean_token_accuracy": 0.41327184438705444, + "num_tokens": 5239738371.0, + "step": 10249 + }, + { + "epoch": 2.77176852352623, + "grad_norm": 3.40625, + "learning_rate": 0.009857769408131307, + "loss": 3.021, + "mean_token_accuracy": 0.4144822061061859, + "num_tokens": 5240262594.0, + "step": 10250 + }, + { + "epoch": 2.7720389399675502, + "grad_norm": 164.0, + "learning_rate": 0.009856205684487465, + "loss": 13.1157, + "mean_token_accuracy": 0.0037145549431443214, + "num_tokens": 5240783054.0, + "step": 10251 + }, + { + "epoch": 2.7723093564088694, + "grad_norm": 7.03125, + "learning_rate": 0.009854641995938591, + "loss": 3.5696, + "mean_token_accuracy": 0.3588516116142273, + "num_tokens": 5241248248.0, + "step": 10252 + }, + { + "epoch": 2.7725797728501895, + "grad_norm": 2.09375, + "learning_rate": 0.009853078342532675, + "loss": 3.0955, + "mean_token_accuracy": 0.4029155671596527, + "num_tokens": 5241772438.0, + "step": 10253 + }, + { + "epoch": 2.7728501892915087, + "grad_norm": 2.328125, + "learning_rate": 0.009851514724317683, + "loss": 2.9991, + "mean_token_accuracy": 0.41044116020202637, + "num_tokens": 5242296650.0, + "step": 10254 + }, + { + "epoch": 2.773120605732829, + "grad_norm": 3.46875, + "learning_rate": 0.009849951141341603, + "loss": 3.1814, + "mean_token_accuracy": 0.396570086479187, + "num_tokens": 5242820913.0, + "step": 10255 + }, + { + "epoch": 2.773391022174148, + "grad_norm": 2.984375, + "learning_rate": 0.009848387593652402, + "loss": 2.9302, + "mean_token_accuracy": 0.41303253173828125, + "num_tokens": 5243345113.0, + "step": 10256 + }, + { + "epoch": 2.7736614386154677, + "grad_norm": 3.359375, + "learning_rate": 0.009846824081298054, + "loss": 3.0736, + "mean_token_accuracy": 0.4154733419418335, + "num_tokens": 5243869365.0, + "step": 10257 + }, + { + "epoch": 2.7739318550567873, + "grad_norm": 2.6875, + "learning_rate": 0.009845260604326537, + "loss": 3.0104, + "mean_token_accuracy": 0.4001542329788208, + "num_tokens": 5244393586.0, + "step": 10258 + }, + { + "epoch": 2.774202271498107, + "grad_norm": 3.203125, + "learning_rate": 0.009843697162785818, + "loss": 3.1825, + "mean_token_accuracy": 0.39895305037498474, + "num_tokens": 5244917777.0, + "step": 10259 + }, + { + "epoch": 2.7744726879394266, + "grad_norm": 2.484375, + "learning_rate": 0.00984213375672387, + "loss": 3.0204, + "mean_token_accuracy": 0.41193902492523193, + "num_tokens": 5245441996.0, + "step": 10260 + }, + { + "epoch": 2.7747431043807462, + "grad_norm": 2.90625, + "learning_rate": 0.009840570386188666, + "loss": 3.0167, + "mean_token_accuracy": 0.39584481716156006, + "num_tokens": 5245966057.0, + "step": 10261 + }, + { + "epoch": 2.775013520822066, + "grad_norm": 3.0625, + "learning_rate": 0.009839007051228173, + "loss": 3.224, + "mean_token_accuracy": 0.3937700688838959, + "num_tokens": 5246458916.0, + "step": 10262 + }, + { + "epoch": 2.7752839372633855, + "grad_norm": 2.90625, + "learning_rate": 0.009837443751890353, + "loss": 2.9554, + "mean_token_accuracy": 0.41700977087020874, + "num_tokens": 5246983094.0, + "step": 10263 + }, + { + "epoch": 2.775554353704705, + "grad_norm": 2.96875, + "learning_rate": 0.009835880488223184, + "loss": 3.19, + "mean_token_accuracy": 0.4027438163757324, + "num_tokens": 5247507335.0, + "step": 10264 + }, + { + "epoch": 2.775824770146025, + "grad_norm": 3.5625, + "learning_rate": 0.009834317260274622, + "loss": 3.1844, + "mean_token_accuracy": 0.3885965347290039, + "num_tokens": 5248031497.0, + "step": 10265 + }, + { + "epoch": 2.7760951865873444, + "grad_norm": 2.546875, + "learning_rate": 0.009832754068092633, + "loss": 2.8896, + "mean_token_accuracy": 0.4269102215766907, + "num_tokens": 5248555745.0, + "step": 10266 + }, + { + "epoch": 2.776365603028664, + "grad_norm": 3.171875, + "learning_rate": 0.009831190911725183, + "loss": 2.9725, + "mean_token_accuracy": 0.41196703910827637, + "num_tokens": 5249079947.0, + "step": 10267 + }, + { + "epoch": 2.7766360194699837, + "grad_norm": 2.828125, + "learning_rate": 0.009829627791220234, + "loss": 2.942, + "mean_token_accuracy": 0.4114665389060974, + "num_tokens": 5249604190.0, + "step": 10268 + }, + { + "epoch": 2.7769064359113034, + "grad_norm": 2.703125, + "learning_rate": 0.009828064706625743, + "loss": 3.0529, + "mean_token_accuracy": 0.4076542854309082, + "num_tokens": 5250128422.0, + "step": 10269 + }, + { + "epoch": 2.777176852352623, + "grad_norm": 2.515625, + "learning_rate": 0.009826501657989676, + "loss": 2.9762, + "mean_token_accuracy": 0.4422174096107483, + "num_tokens": 5250567947.0, + "step": 10270 + }, + { + "epoch": 2.7774472687939427, + "grad_norm": 99.0, + "learning_rate": 0.009824938645359993, + "loss": 14.4396, + "mean_token_accuracy": 1.623397838557139e-05, + "num_tokens": 5251053853.0, + "step": 10271 + }, + { + "epoch": 2.7777176852352623, + "grad_norm": 6.90625, + "learning_rate": 0.00982337566878464, + "loss": 3.4951, + "mean_token_accuracy": 0.36190253496170044, + "num_tokens": 5251543179.0, + "step": 10272 + }, + { + "epoch": 2.777988101676582, + "grad_norm": 2.3125, + "learning_rate": 0.009821812728311586, + "loss": 3.1809, + "mean_token_accuracy": 0.4156379997730255, + "num_tokens": 5252005645.0, + "step": 10273 + }, + { + "epoch": 2.7782585181179016, + "grad_norm": 2.875, + "learning_rate": 0.009820249823988785, + "loss": 3.2351, + "mean_token_accuracy": 0.4078682065010071, + "num_tokens": 5252513077.0, + "step": 10274 + }, + { + "epoch": 2.7785289345592212, + "grad_norm": 2.828125, + "learning_rate": 0.009818686955864183, + "loss": 3.025, + "mean_token_accuracy": 0.40705791115760803, + "num_tokens": 5253037064.0, + "step": 10275 + }, + { + "epoch": 2.778799351000541, + "grad_norm": 2.953125, + "learning_rate": 0.009817124123985742, + "loss": 3.1398, + "mean_token_accuracy": 0.4102747440338135, + "num_tokens": 5253561236.0, + "step": 10276 + }, + { + "epoch": 2.7790697674418605, + "grad_norm": 4.1875, + "learning_rate": 0.00981556132840141, + "loss": 3.2428, + "mean_token_accuracy": 0.4099477529525757, + "num_tokens": 5254085342.0, + "step": 10277 + }, + { + "epoch": 2.77934018388318, + "grad_norm": 2.9375, + "learning_rate": 0.009813998569159144, + "loss": 3.1081, + "mean_token_accuracy": 0.42186617851257324, + "num_tokens": 5254569709.0, + "step": 10278 + }, + { + "epoch": 2.7796106003245, + "grad_norm": 3.03125, + "learning_rate": 0.009812435846306884, + "loss": 3.1865, + "mean_token_accuracy": 0.41068679094314575, + "num_tokens": 5255059827.0, + "step": 10279 + }, + { + "epoch": 2.7798810167658194, + "grad_norm": 3.25, + "learning_rate": 0.009810873159892588, + "loss": 3.1254, + "mean_token_accuracy": 0.3999026119709015, + "num_tokens": 5255584065.0, + "step": 10280 + }, + { + "epoch": 2.780151433207139, + "grad_norm": 2.609375, + "learning_rate": 0.009809310509964198, + "loss": 2.939, + "mean_token_accuracy": 0.42273056507110596, + "num_tokens": 5256108245.0, + "step": 10281 + }, + { + "epoch": 2.7804218496484587, + "grad_norm": 3.296875, + "learning_rate": 0.009807747896569668, + "loss": 3.0807, + "mean_token_accuracy": 0.40347450971603394, + "num_tokens": 5256632478.0, + "step": 10282 + }, + { + "epoch": 2.7806922660897784, + "grad_norm": 3.78125, + "learning_rate": 0.009806185319756933, + "loss": 3.0023, + "mean_token_accuracy": 0.415485680103302, + "num_tokens": 5257156683.0, + "step": 10283 + }, + { + "epoch": 2.780962682531098, + "grad_norm": 2.578125, + "learning_rate": 0.009804622779573948, + "loss": 3.0029, + "mean_token_accuracy": 0.424314945936203, + "num_tokens": 5257670755.0, + "step": 10284 + }, + { + "epoch": 2.7812330989724177, + "grad_norm": 3.5, + "learning_rate": 0.009803060276068651, + "loss": 2.9637, + "mean_token_accuracy": 0.4109172821044922, + "num_tokens": 5258154868.0, + "step": 10285 + }, + { + "epoch": 2.7815035154137373, + "grad_norm": 2.765625, + "learning_rate": 0.009801497809288979, + "loss": 3.1507, + "mean_token_accuracy": 0.397603303194046, + "num_tokens": 5258655422.0, + "step": 10286 + }, + { + "epoch": 2.781773931855057, + "grad_norm": 3.546875, + "learning_rate": 0.009799935379282882, + "loss": 3.1629, + "mean_token_accuracy": 0.3912537097930908, + "num_tokens": 5259179675.0, + "step": 10287 + }, + { + "epoch": 2.7820443482963766, + "grad_norm": 2.9375, + "learning_rate": 0.0097983729860983, + "loss": 3.1646, + "mean_token_accuracy": 0.3925659656524658, + "num_tokens": 5259703751.0, + "step": 10288 + }, + { + "epoch": 2.782314764737696, + "grad_norm": 3.703125, + "learning_rate": 0.009796810629783166, + "loss": 3.0356, + "mean_token_accuracy": 0.3853626251220703, + "num_tokens": 5260186868.0, + "step": 10289 + }, + { + "epoch": 2.782585181179016, + "grad_norm": 2.625, + "learning_rate": 0.00979524831038542, + "loss": 3.0831, + "mean_token_accuracy": 0.40911543369293213, + "num_tokens": 5260711124.0, + "step": 10290 + }, + { + "epoch": 2.782855597620335, + "grad_norm": 42.75, + "learning_rate": 0.009793686027953002, + "loss": 10.317, + "mean_token_accuracy": 0.010638200677931309, + "num_tokens": 5261235400.0, + "step": 10291 + }, + { + "epoch": 2.783126014061655, + "grad_norm": 6.6875, + "learning_rate": 0.009792123782533838, + "loss": 3.2745, + "mean_token_accuracy": 0.3740292191505432, + "num_tokens": 5261759565.0, + "step": 10292 + }, + { + "epoch": 2.7833964305029744, + "grad_norm": 2.46875, + "learning_rate": 0.009790561574175873, + "loss": 3.0664, + "mean_token_accuracy": 0.4046633541584015, + "num_tokens": 5262279541.0, + "step": 10293 + }, + { + "epoch": 2.7836668469442944, + "grad_norm": 2.359375, + "learning_rate": 0.009788999402927035, + "loss": 3.15, + "mean_token_accuracy": 0.39429008960723877, + "num_tokens": 5262803725.0, + "step": 10294 + }, + { + "epoch": 2.7839372633856136, + "grad_norm": 3.046875, + "learning_rate": 0.009787437268835254, + "loss": 2.5976, + "mean_token_accuracy": 0.5089585185050964, + "num_tokens": 5263327989.0, + "step": 10295 + }, + { + "epoch": 2.7842076798269337, + "grad_norm": 2.75, + "learning_rate": 0.009785875171948469, + "loss": 3.0759, + "mean_token_accuracy": 0.40078598260879517, + "num_tokens": 5263852157.0, + "step": 10296 + }, + { + "epoch": 2.784478096268253, + "grad_norm": 3.359375, + "learning_rate": 0.0097843131123146, + "loss": 3.0529, + "mean_token_accuracy": 0.3900565505027771, + "num_tokens": 5264376380.0, + "step": 10297 + }, + { + "epoch": 2.7847485127095726, + "grad_norm": 3.421875, + "learning_rate": 0.009782751089981579, + "loss": 3.0587, + "mean_token_accuracy": 0.4041839838027954, + "num_tokens": 5264900596.0, + "step": 10298 + }, + { + "epoch": 2.785018929150892, + "grad_norm": 3.234375, + "learning_rate": 0.009781189104997336, + "loss": 3.0872, + "mean_token_accuracy": 0.3940434455871582, + "num_tokens": 5265382158.0, + "step": 10299 + }, + { + "epoch": 2.785289345592212, + "grad_norm": 2.96875, + "learning_rate": 0.009779627157409794, + "loss": 3.1495, + "mean_token_accuracy": 0.40600886940956116, + "num_tokens": 5265906254.0, + "step": 10300 + }, + { + "epoch": 2.7855597620335315, + "grad_norm": 3.3125, + "learning_rate": 0.009778065247266883, + "loss": 3.1391, + "mean_token_accuracy": 0.39744600653648376, + "num_tokens": 5266418117.0, + "step": 10301 + }, + { + "epoch": 2.785830178474851, + "grad_norm": 2.828125, + "learning_rate": 0.00977650337461652, + "loss": 3.0867, + "mean_token_accuracy": 0.4301055669784546, + "num_tokens": 5266877468.0, + "step": 10302 + }, + { + "epoch": 2.786100594916171, + "grad_norm": 2.953125, + "learning_rate": 0.009774941539506633, + "loss": 2.8777, + "mean_token_accuracy": 0.416322261095047, + "num_tokens": 5267401720.0, + "step": 10303 + }, + { + "epoch": 2.7863710113574904, + "grad_norm": 2.015625, + "learning_rate": 0.009773379741985144, + "loss": 3.0621, + "mean_token_accuracy": 0.40868711471557617, + "num_tokens": 5267925966.0, + "step": 10304 + }, + { + "epoch": 2.78664142779881, + "grad_norm": 2.640625, + "learning_rate": 0.009771817982099971, + "loss": 2.9187, + "mean_token_accuracy": 0.4223004877567291, + "num_tokens": 5268450037.0, + "step": 10305 + }, + { + "epoch": 2.7869118442401297, + "grad_norm": 2.5, + "learning_rate": 0.00977025625989903, + "loss": 3.1338, + "mean_token_accuracy": 0.39001893997192383, + "num_tokens": 5268974254.0, + "step": 10306 + }, + { + "epoch": 2.7871822606814494, + "grad_norm": 2.78125, + "learning_rate": 0.00976869457543025, + "loss": 2.8789, + "mean_token_accuracy": 0.4389132261276245, + "num_tokens": 5269450586.0, + "step": 10307 + }, + { + "epoch": 2.787452677122769, + "grad_norm": 2.328125, + "learning_rate": 0.009767132928741538, + "loss": 3.3584, + "mean_token_accuracy": 0.3932240903377533, + "num_tokens": 5269958845.0, + "step": 10308 + }, + { + "epoch": 2.7877230935640886, + "grad_norm": 7.71875, + "learning_rate": 0.009765571319880813, + "loss": 2.9859, + "mean_token_accuracy": 0.4224435091018677, + "num_tokens": 5270483018.0, + "step": 10309 + }, + { + "epoch": 2.7879935100054083, + "grad_norm": 2.265625, + "learning_rate": 0.009764009748895993, + "loss": 2.9579, + "mean_token_accuracy": 0.4274175763130188, + "num_tokens": 5270962837.0, + "step": 10310 + }, + { + "epoch": 2.788263926446728, + "grad_norm": 14.375, + "learning_rate": 0.009762448215834987, + "loss": 8.5936, + "mean_token_accuracy": 0.022950580343604088, + "num_tokens": 5271487039.0, + "step": 10311 + }, + { + "epoch": 2.7885343428880476, + "grad_norm": 4.375, + "learning_rate": 0.009760886720745706, + "loss": 3.0663, + "mean_token_accuracy": 0.37683624029159546, + "num_tokens": 5272011303.0, + "step": 10312 + }, + { + "epoch": 2.7888047593293672, + "grad_norm": 2.328125, + "learning_rate": 0.009759325263676068, + "loss": 2.9789, + "mean_token_accuracy": 0.415971040725708, + "num_tokens": 5272535414.0, + "step": 10313 + }, + { + "epoch": 2.789075175770687, + "grad_norm": 3.0625, + "learning_rate": 0.00975776384467398, + "loss": 3.0201, + "mean_token_accuracy": 0.3989350497722626, + "num_tokens": 5273034702.0, + "step": 10314 + }, + { + "epoch": 2.7893455922120065, + "grad_norm": 4.1875, + "learning_rate": 0.009756202463787347, + "loss": 3.1415, + "mean_token_accuracy": 0.41028928756713867, + "num_tokens": 5273549815.0, + "step": 10315 + }, + { + "epoch": 2.789616008653326, + "grad_norm": 3.3125, + "learning_rate": 0.009754641121064082, + "loss": 3.1711, + "mean_token_accuracy": 0.41521692276000977, + "num_tokens": 5274034536.0, + "step": 10316 + }, + { + "epoch": 2.789886425094646, + "grad_norm": 3.15625, + "learning_rate": 0.009753079816552093, + "loss": 2.9988, + "mean_token_accuracy": 0.4055503010749817, + "num_tokens": 5274558767.0, + "step": 10317 + }, + { + "epoch": 2.7901568415359654, + "grad_norm": 2.6875, + "learning_rate": 0.009751518550299277, + "loss": 3.1243, + "mean_token_accuracy": 0.3909054398536682, + "num_tokens": 5275082927.0, + "step": 10318 + }, + { + "epoch": 2.790427257977285, + "grad_norm": 2.828125, + "learning_rate": 0.009749957322353547, + "loss": 3.0518, + "mean_token_accuracy": 0.38868600130081177, + "num_tokens": 5275607194.0, + "step": 10319 + }, + { + "epoch": 2.7906976744186047, + "grad_norm": 2.890625, + "learning_rate": 0.009748396132762802, + "loss": 2.8438, + "mean_token_accuracy": 0.4198635220527649, + "num_tokens": 5276131468.0, + "step": 10320 + }, + { + "epoch": 2.7909680908599244, + "grad_norm": 3.359375, + "learning_rate": 0.00974683498157494, + "loss": 2.964, + "mean_token_accuracy": 0.4546852111816406, + "num_tokens": 5276538066.0, + "step": 10321 + }, + { + "epoch": 2.791238507301244, + "grad_norm": 3.421875, + "learning_rate": 0.00974527386883787, + "loss": 3.1309, + "mean_token_accuracy": 0.39254438877105713, + "num_tokens": 5277062289.0, + "step": 10322 + }, + { + "epoch": 2.7915089237425637, + "grad_norm": 3.03125, + "learning_rate": 0.009743712794599488, + "loss": 2.9836, + "mean_token_accuracy": 0.4103628993034363, + "num_tokens": 5277586567.0, + "step": 10323 + }, + { + "epoch": 2.7917793401838833, + "grad_norm": 2.90625, + "learning_rate": 0.00974215175890769, + "loss": 3.0971, + "mean_token_accuracy": 0.40332895517349243, + "num_tokens": 5278110841.0, + "step": 10324 + }, + { + "epoch": 2.792049756625203, + "grad_norm": 2.75, + "learning_rate": 0.009740590761810377, + "loss": 2.7249, + "mean_token_accuracy": 0.4550931751728058, + "num_tokens": 5278635016.0, + "step": 10325 + }, + { + "epoch": 2.7923201730665226, + "grad_norm": 3.015625, + "learning_rate": 0.009739029803355441, + "loss": 3.1397, + "mean_token_accuracy": 0.3957641124725342, + "num_tokens": 5279159234.0, + "step": 10326 + }, + { + "epoch": 2.7925905895078422, + "grad_norm": 2.828125, + "learning_rate": 0.009737468883590781, + "loss": 3.058, + "mean_token_accuracy": 0.42544952034950256, + "num_tokens": 5279620489.0, + "step": 10327 + }, + { + "epoch": 2.792861005949162, + "grad_norm": 4.21875, + "learning_rate": 0.009735908002564289, + "loss": 2.8432, + "mean_token_accuracy": 0.4518096446990967, + "num_tokens": 5280144504.0, + "step": 10328 + }, + { + "epoch": 2.7931314223904815, + "grad_norm": 1.859375, + "learning_rate": 0.009734347160323854, + "loss": 2.9867, + "mean_token_accuracy": 0.4240599274635315, + "num_tokens": 5280657170.0, + "step": 10329 + }, + { + "epoch": 2.7934018388318007, + "grad_norm": 3.421875, + "learning_rate": 0.009732786356917371, + "loss": 2.9076, + "mean_token_accuracy": 0.4125916659832001, + "num_tokens": 5281181397.0, + "step": 10330 + }, + { + "epoch": 2.793672255273121, + "grad_norm": 95.0, + "learning_rate": 0.009731225592392732, + "loss": 16.0602, + "mean_token_accuracy": 0.005296121351420879, + "num_tokens": 5281705540.0, + "step": 10331 + }, + { + "epoch": 2.79394267171444, + "grad_norm": 5.53125, + "learning_rate": 0.009729664866797818, + "loss": 3.294, + "mean_token_accuracy": 0.3620162010192871, + "num_tokens": 5282229817.0, + "step": 10332 + }, + { + "epoch": 2.79421308815576, + "grad_norm": 1.8046875, + "learning_rate": 0.009728104180180528, + "loss": 2.8794, + "mean_token_accuracy": 0.4285207688808441, + "num_tokens": 5282753925.0, + "step": 10333 + }, + { + "epoch": 2.7944835045970793, + "grad_norm": 2.953125, + "learning_rate": 0.009726543532588741, + "loss": 3.3261, + "mean_token_accuracy": 0.3881664276123047, + "num_tokens": 5283278201.0, + "step": 10334 + }, + { + "epoch": 2.7947539210383994, + "grad_norm": 3.25, + "learning_rate": 0.009724982924070338, + "loss": 3.0447, + "mean_token_accuracy": 0.394988477230072, + "num_tokens": 5283802362.0, + "step": 10335 + }, + { + "epoch": 2.7950243374797186, + "grad_norm": 2.71875, + "learning_rate": 0.009723422354673215, + "loss": 3.2231, + "mean_token_accuracy": 0.37739098072052, + "num_tokens": 5284326530.0, + "step": 10336 + }, + { + "epoch": 2.7952947539210387, + "grad_norm": 18.125, + "learning_rate": 0.009721861824445245, + "loss": 2.6492, + "mean_token_accuracy": 0.44025659561157227, + "num_tokens": 5284850776.0, + "step": 10337 + }, + { + "epoch": 2.795565170362358, + "grad_norm": 4.59375, + "learning_rate": 0.009720301333434309, + "loss": 3.3957, + "mean_token_accuracy": 0.3886488080024719, + "num_tokens": 5285374929.0, + "step": 10338 + }, + { + "epoch": 2.7958355868036775, + "grad_norm": 2.0625, + "learning_rate": 0.009718740881688297, + "loss": 3.1118, + "mean_token_accuracy": 0.41782087087631226, + "num_tokens": 5285849830.0, + "step": 10339 + }, + { + "epoch": 2.796106003244997, + "grad_norm": 3.0, + "learning_rate": 0.009717180469255083, + "loss": 3.1428, + "mean_token_accuracy": 0.4155501127243042, + "num_tokens": 5286351606.0, + "step": 10340 + }, + { + "epoch": 2.796376419686317, + "grad_norm": 3.296875, + "learning_rate": 0.009715620096182542, + "loss": 3.1877, + "mean_token_accuracy": 0.4066498875617981, + "num_tokens": 5286875839.0, + "step": 10341 + }, + { + "epoch": 2.7966468361276364, + "grad_norm": 4.09375, + "learning_rate": 0.009714059762518554, + "loss": 3.1963, + "mean_token_accuracy": 0.4060521125793457, + "num_tokens": 5287400016.0, + "step": 10342 + }, + { + "epoch": 2.796917252568956, + "grad_norm": 3.8125, + "learning_rate": 0.009712499468310995, + "loss": 3.1837, + "mean_token_accuracy": 0.40690720081329346, + "num_tokens": 5287924255.0, + "step": 10343 + }, + { + "epoch": 2.7971876690102757, + "grad_norm": 2.859375, + "learning_rate": 0.00971093921360774, + "loss": 3.2594, + "mean_token_accuracy": 0.3711031973361969, + "num_tokens": 5288387924.0, + "step": 10344 + }, + { + "epoch": 2.7974580854515954, + "grad_norm": 2.609375, + "learning_rate": 0.009709378998456657, + "loss": 3.0604, + "mean_token_accuracy": 0.40438923239707947, + "num_tokens": 5288911710.0, + "step": 10345 + }, + { + "epoch": 2.797728501892915, + "grad_norm": 2.90625, + "learning_rate": 0.009707818822905624, + "loss": 3.2271, + "mean_token_accuracy": 0.38936492800712585, + "num_tokens": 5289435799.0, + "step": 10346 + }, + { + "epoch": 2.7979989183342346, + "grad_norm": 3.5, + "learning_rate": 0.00970625868700251, + "loss": 3.0078, + "mean_token_accuracy": 0.39678338170051575, + "num_tokens": 5289955420.0, + "step": 10347 + }, + { + "epoch": 2.7982693347755543, + "grad_norm": 2.8125, + "learning_rate": 0.009704698590795185, + "loss": 3.0143, + "mean_token_accuracy": 0.4169059693813324, + "num_tokens": 5290479648.0, + "step": 10348 + }, + { + "epoch": 2.798539751216874, + "grad_norm": 25.25, + "learning_rate": 0.009703138534331513, + "loss": 3.2928, + "mean_token_accuracy": 0.4075051546096802, + "num_tokens": 5290946105.0, + "step": 10349 + }, + { + "epoch": 2.7988101676581936, + "grad_norm": 4.4375, + "learning_rate": 0.009701578517659369, + "loss": 3.3118, + "mean_token_accuracy": 0.4006848931312561, + "num_tokens": 5291371224.0, + "step": 10350 + }, + { + "epoch": 2.799080584099513, + "grad_norm": 16.875, + "learning_rate": 0.009700018540826612, + "loss": 11.1022, + "mean_token_accuracy": 0.013384022749960423, + "num_tokens": 5291895429.0, + "step": 10351 + }, + { + "epoch": 2.799351000540833, + "grad_norm": 4.28125, + "learning_rate": 0.009698458603881107, + "loss": 3.2009, + "mean_token_accuracy": 0.3827417492866516, + "num_tokens": 5292419615.0, + "step": 10352 + }, + { + "epoch": 2.7996214169821525, + "grad_norm": 2.28125, + "learning_rate": 0.009696898706870722, + "loss": 3.1975, + "mean_token_accuracy": 0.36915433406829834, + "num_tokens": 5292943651.0, + "step": 10353 + }, + { + "epoch": 2.799891833423472, + "grad_norm": 2.25, + "learning_rate": 0.009695338849843319, + "loss": 2.9208, + "mean_token_accuracy": 0.41855642199516296, + "num_tokens": 5293467854.0, + "step": 10354 + }, + { + "epoch": 2.800162249864792, + "grad_norm": 2.859375, + "learning_rate": 0.009693779032846753, + "loss": 3.3227, + "mean_token_accuracy": 0.4091700315475464, + "num_tokens": 5293892538.0, + "step": 10355 + }, + { + "epoch": 2.8004326663061114, + "grad_norm": 2.75, + "learning_rate": 0.009692219255928891, + "loss": 3.1869, + "mean_token_accuracy": 0.403192400932312, + "num_tokens": 5294416697.0, + "step": 10356 + }, + { + "epoch": 2.800703082747431, + "grad_norm": 3.328125, + "learning_rate": 0.009690659519137586, + "loss": 3.1262, + "mean_token_accuracy": 0.4015340209007263, + "num_tokens": 5294908140.0, + "step": 10357 + }, + { + "epoch": 2.8009734991887507, + "grad_norm": 2.640625, + "learning_rate": 0.009689099822520697, + "loss": 3.1132, + "mean_token_accuracy": 0.3945637345314026, + "num_tokens": 5295432355.0, + "step": 10358 + }, + { + "epoch": 2.8012439156300704, + "grad_norm": 2.953125, + "learning_rate": 0.009687540166126081, + "loss": 3.0314, + "mean_token_accuracy": 0.42267030477523804, + "num_tokens": 5295956554.0, + "step": 10359 + }, + { + "epoch": 2.80151433207139, + "grad_norm": 2.453125, + "learning_rate": 0.009685980550001595, + "loss": 3.1818, + "mean_token_accuracy": 0.417916864156723, + "num_tokens": 5296408680.0, + "step": 10360 + }, + { + "epoch": 2.8017847485127096, + "grad_norm": 2.65625, + "learning_rate": 0.009684420974195085, + "loss": 3.0449, + "mean_token_accuracy": 0.38970839977264404, + "num_tokens": 5296932874.0, + "step": 10361 + }, + { + "epoch": 2.8020551649540293, + "grad_norm": 2.96875, + "learning_rate": 0.009682861438754414, + "loss": 3.1623, + "mean_token_accuracy": 0.3963964283466339, + "num_tokens": 5297457151.0, + "step": 10362 + }, + { + "epoch": 2.802325581395349, + "grad_norm": 3.28125, + "learning_rate": 0.009681301943727426, + "loss": 2.8945, + "mean_token_accuracy": 0.41520872712135315, + "num_tokens": 5297957822.0, + "step": 10363 + }, + { + "epoch": 2.8025959978366686, + "grad_norm": 2.96875, + "learning_rate": 0.00967974248916197, + "loss": 3.0136, + "mean_token_accuracy": 0.4052461087703705, + "num_tokens": 5298452561.0, + "step": 10364 + }, + { + "epoch": 2.802866414277988, + "grad_norm": 3.046875, + "learning_rate": 0.009678183075105895, + "loss": 3.1102, + "mean_token_accuracy": 0.42483025789260864, + "num_tokens": 5298976828.0, + "step": 10365 + }, + { + "epoch": 2.803136830719308, + "grad_norm": 3.109375, + "learning_rate": 0.009676623701607058, + "loss": 3.0343, + "mean_token_accuracy": 0.4304668605327606, + "num_tokens": 5299501091.0, + "step": 10366 + }, + { + "epoch": 2.8034072471606275, + "grad_norm": 2.953125, + "learning_rate": 0.00967506436871329, + "loss": 3.11, + "mean_token_accuracy": 0.41240575909614563, + "num_tokens": 5299962602.0, + "step": 10367 + }, + { + "epoch": 2.803677663601947, + "grad_norm": 2.59375, + "learning_rate": 0.009673505076472449, + "loss": 2.8958, + "mean_token_accuracy": 0.4335484504699707, + "num_tokens": 5300484457.0, + "step": 10368 + }, + { + "epoch": 2.803948080043267, + "grad_norm": 2.359375, + "learning_rate": 0.009671945824932367, + "loss": 2.9346, + "mean_token_accuracy": 0.4283129572868347, + "num_tokens": 5301008696.0, + "step": 10369 + }, + { + "epoch": 2.8042184964845864, + "grad_norm": 2.46875, + "learning_rate": 0.009670386614140897, + "loss": 2.9046, + "mean_token_accuracy": 0.4098285138607025, + "num_tokens": 5301532776.0, + "step": 10370 + }, + { + "epoch": 2.8044889129259056, + "grad_norm": 9.1875, + "learning_rate": 0.00966882744414588, + "loss": 10.6378, + "mean_token_accuracy": 0.00014497975644189864, + "num_tokens": 5302027458.0, + "step": 10371 + }, + { + "epoch": 2.8047593293672257, + "grad_norm": 8.3125, + "learning_rate": 0.009667268314995146, + "loss": 3.4399, + "mean_token_accuracy": 0.3693571388721466, + "num_tokens": 5302543734.0, + "step": 10372 + }, + { + "epoch": 2.805029745808545, + "grad_norm": 3.59375, + "learning_rate": 0.009665709226736539, + "loss": 3.1322, + "mean_token_accuracy": 0.41937339305877686, + "num_tokens": 5303030894.0, + "step": 10373 + }, + { + "epoch": 2.805300162249865, + "grad_norm": 3.046875, + "learning_rate": 0.009664150179417899, + "loss": 3.1026, + "mean_token_accuracy": 0.4030943214893341, + "num_tokens": 5303486032.0, + "step": 10374 + }, + { + "epoch": 2.805570578691184, + "grad_norm": 3.3125, + "learning_rate": 0.009662591173087058, + "loss": 3.1318, + "mean_token_accuracy": 0.4016566276550293, + "num_tokens": 5304010262.0, + "step": 10375 + }, + { + "epoch": 2.8058409951325043, + "grad_norm": 2.84375, + "learning_rate": 0.009661032207791855, + "loss": 2.9507, + "mean_token_accuracy": 0.41266292333602905, + "num_tokens": 5304501840.0, + "step": 10376 + }, + { + "epoch": 2.8061114115738235, + "grad_norm": 2.96875, + "learning_rate": 0.009659473283580123, + "loss": 2.8708, + "mean_token_accuracy": 0.41129070520401, + "num_tokens": 5305025991.0, + "step": 10377 + }, + { + "epoch": 2.8063818280151436, + "grad_norm": 2.421875, + "learning_rate": 0.009657914400499687, + "loss": 3.0552, + "mean_token_accuracy": 0.4208972454071045, + "num_tokens": 5305550245.0, + "step": 10378 + }, + { + "epoch": 2.8066522444564628, + "grad_norm": 3.796875, + "learning_rate": 0.009656355558598392, + "loss": 3.3537, + "mean_token_accuracy": 0.39060184359550476, + "num_tokens": 5306036699.0, + "step": 10379 + }, + { + "epoch": 2.806922660897783, + "grad_norm": 2.765625, + "learning_rate": 0.009654796757924055, + "loss": 3.0068, + "mean_token_accuracy": 0.40485748648643494, + "num_tokens": 5306560930.0, + "step": 10380 + }, + { + "epoch": 2.807193077339102, + "grad_norm": 3.5625, + "learning_rate": 0.009653237998524506, + "loss": 3.0007, + "mean_token_accuracy": 0.4019913673400879, + "num_tokens": 5307085171.0, + "step": 10381 + }, + { + "epoch": 2.8074634937804217, + "grad_norm": 2.859375, + "learning_rate": 0.009651679280447583, + "loss": 2.9848, + "mean_token_accuracy": 0.41745805740356445, + "num_tokens": 5307588284.0, + "step": 10382 + }, + { + "epoch": 2.8077339102217413, + "grad_norm": 2.84375, + "learning_rate": 0.009650120603741104, + "loss": 3.1383, + "mean_token_accuracy": 0.41450804471969604, + "num_tokens": 5308112369.0, + "step": 10383 + }, + { + "epoch": 2.808004326663061, + "grad_norm": 2.671875, + "learning_rate": 0.009648561968452889, + "loss": 3.1207, + "mean_token_accuracy": 0.40461230278015137, + "num_tokens": 5308636454.0, + "step": 10384 + }, + { + "epoch": 2.8082747431043806, + "grad_norm": 3.171875, + "learning_rate": 0.009647003374630772, + "loss": 2.8721, + "mean_token_accuracy": 0.42661812901496887, + "num_tokens": 5309160597.0, + "step": 10385 + }, + { + "epoch": 2.8085451595457003, + "grad_norm": 3.5, + "learning_rate": 0.009645444822322569, + "loss": 3.3018, + "mean_token_accuracy": 0.36822599172592163, + "num_tokens": 5309684874.0, + "step": 10386 + }, + { + "epoch": 2.80881557598702, + "grad_norm": 3.109375, + "learning_rate": 0.0096438863115761, + "loss": 2.9754, + "mean_token_accuracy": 0.4188573658466339, + "num_tokens": 5310112962.0, + "step": 10387 + }, + { + "epoch": 2.8090859924283396, + "grad_norm": 3.03125, + "learning_rate": 0.009642327842439192, + "loss": 3.0601, + "mean_token_accuracy": 0.40829989314079285, + "num_tokens": 5310637171.0, + "step": 10388 + }, + { + "epoch": 2.809356408869659, + "grad_norm": 3.40625, + "learning_rate": 0.009640769414959656, + "loss": 3.138, + "mean_token_accuracy": 0.3962130546569824, + "num_tokens": 5311161426.0, + "step": 10389 + }, + { + "epoch": 2.809626825310979, + "grad_norm": 2.8125, + "learning_rate": 0.009639211029185309, + "loss": 3.0877, + "mean_token_accuracy": 0.3733111619949341, + "num_tokens": 5311685677.0, + "step": 10390 + }, + { + "epoch": 2.8098972417522985, + "grad_norm": 23.125, + "learning_rate": 0.009637652685163974, + "loss": 10.8284, + "mean_token_accuracy": 0.008170107379555702, + "num_tokens": 5312146703.0, + "step": 10391 + }, + { + "epoch": 2.810167658193618, + "grad_norm": 6.625, + "learning_rate": 0.009636094382943454, + "loss": 3.3223, + "mean_token_accuracy": 0.40726611018180847, + "num_tokens": 5312670900.0, + "step": 10392 + }, + { + "epoch": 2.8104380746349378, + "grad_norm": 2.421875, + "learning_rate": 0.009634536122571575, + "loss": 3.053, + "mean_token_accuracy": 0.3911478519439697, + "num_tokens": 5313195143.0, + "step": 10393 + }, + { + "epoch": 2.8107084910762574, + "grad_norm": 3.21875, + "learning_rate": 0.009632977904096143, + "loss": 3.3093, + "mean_token_accuracy": 0.40252724289894104, + "num_tokens": 5313659811.0, + "step": 10394 + }, + { + "epoch": 2.810978907517577, + "grad_norm": 3.609375, + "learning_rate": 0.009631419727564967, + "loss": 3.1334, + "mean_token_accuracy": 0.4085802733898163, + "num_tokens": 5314184026.0, + "step": 10395 + }, + { + "epoch": 2.8112493239588967, + "grad_norm": 2.859375, + "learning_rate": 0.009629861593025861, + "loss": 3.03, + "mean_token_accuracy": 0.40188074111938477, + "num_tokens": 5314708278.0, + "step": 10396 + }, + { + "epoch": 2.8115197404002163, + "grad_norm": 2.78125, + "learning_rate": 0.009628303500526632, + "loss": 3.1446, + "mean_token_accuracy": 0.4093242883682251, + "num_tokens": 5315232441.0, + "step": 10397 + }, + { + "epoch": 2.811790156841536, + "grad_norm": 3.0, + "learning_rate": 0.00962674545011508, + "loss": 3.2146, + "mean_token_accuracy": 0.405191570520401, + "num_tokens": 5315756646.0, + "step": 10398 + }, + { + "epoch": 2.8120605732828556, + "grad_norm": 3.09375, + "learning_rate": 0.00962518744183902, + "loss": 3.0724, + "mean_token_accuracy": 0.42546749114990234, + "num_tokens": 5316216170.0, + "step": 10399 + }, + { + "epoch": 2.8123309897241753, + "grad_norm": 3.171875, + "learning_rate": 0.009623629475746251, + "loss": 2.8655, + "mean_token_accuracy": 0.4488784074783325, + "num_tokens": 5316740317.0, + "step": 10400 + }, + { + "epoch": 2.812601406165495, + "grad_norm": 2.828125, + "learning_rate": 0.009622071551884577, + "loss": 2.9525, + "mean_token_accuracy": 0.41509050130844116, + "num_tokens": 5317264385.0, + "step": 10401 + }, + { + "epoch": 2.8128718226068146, + "grad_norm": 2.53125, + "learning_rate": 0.0096205136703018, + "loss": 2.7437, + "mean_token_accuracy": 0.4256405532360077, + "num_tokens": 5317788620.0, + "step": 10402 + }, + { + "epoch": 2.813142239048134, + "grad_norm": 2.421875, + "learning_rate": 0.009618955831045724, + "loss": 2.9664, + "mean_token_accuracy": 0.4290061891078949, + "num_tokens": 5318255138.0, + "step": 10403 + }, + { + "epoch": 2.813412655489454, + "grad_norm": 2.875, + "learning_rate": 0.00961739803416414, + "loss": 3.04, + "mean_token_accuracy": 0.39192521572113037, + "num_tokens": 5318779268.0, + "step": 10404 + }, + { + "epoch": 2.8136830719307735, + "grad_norm": 2.53125, + "learning_rate": 0.009615840279704854, + "loss": 2.9651, + "mean_token_accuracy": 0.412614643573761, + "num_tokens": 5319287193.0, + "step": 10405 + }, + { + "epoch": 2.813953488372093, + "grad_norm": 2.625, + "learning_rate": 0.009614282567715658, + "loss": 3.139, + "mean_token_accuracy": 0.40589791536331177, + "num_tokens": 5319756413.0, + "step": 10406 + }, + { + "epoch": 2.814223904813413, + "grad_norm": 3.421875, + "learning_rate": 0.009612724898244345, + "loss": 2.9639, + "mean_token_accuracy": 0.45317208766937256, + "num_tokens": 5320280605.0, + "step": 10407 + }, + { + "epoch": 2.8144943212547324, + "grad_norm": 3.125, + "learning_rate": 0.009611167271338712, + "loss": 2.835, + "mean_token_accuracy": 0.41745567321777344, + "num_tokens": 5320775805.0, + "step": 10408 + }, + { + "epoch": 2.814764737696052, + "grad_norm": 3.546875, + "learning_rate": 0.009609609687046555, + "loss": 3.0087, + "mean_token_accuracy": 0.3542836904525757, + "num_tokens": 5321299955.0, + "step": 10409 + }, + { + "epoch": 2.8150351541373717, + "grad_norm": 3.03125, + "learning_rate": 0.009608052145415658, + "loss": 2.7744, + "mean_token_accuracy": 0.43444955348968506, + "num_tokens": 5321738825.0, + "step": 10410 + }, + { + "epoch": 2.8153055705786914, + "grad_norm": 114.5, + "learning_rate": 0.009606494646493817, + "loss": 11.6502, + "mean_token_accuracy": 4.931103831040673e-05, + "num_tokens": 5322262955.0, + "step": 10411 + }, + { + "epoch": 2.8155759870200106, + "grad_norm": 7.0625, + "learning_rate": 0.00960493719032882, + "loss": 3.2911, + "mean_token_accuracy": 0.37517058849334717, + "num_tokens": 5322729584.0, + "step": 10412 + }, + { + "epoch": 2.8158464034613306, + "grad_norm": 2.28125, + "learning_rate": 0.009603379776968447, + "loss": 3.0688, + "mean_token_accuracy": 0.4206126630306244, + "num_tokens": 5323253787.0, + "step": 10413 + }, + { + "epoch": 2.81611681990265, + "grad_norm": 3.765625, + "learning_rate": 0.009601822406460495, + "loss": 3.2075, + "mean_token_accuracy": 0.41423118114471436, + "num_tokens": 5323778068.0, + "step": 10414 + }, + { + "epoch": 2.81638723634397, + "grad_norm": 3.421875, + "learning_rate": 0.009600265078852737, + "loss": 3.0852, + "mean_token_accuracy": 0.4056542217731476, + "num_tokens": 5324302325.0, + "step": 10415 + }, + { + "epoch": 2.816657652785289, + "grad_norm": 2.859375, + "learning_rate": 0.009598707794192966, + "loss": 2.9707, + "mean_token_accuracy": 0.4079415500164032, + "num_tokens": 5324826498.0, + "step": 10416 + }, + { + "epoch": 2.816928069226609, + "grad_norm": 2.53125, + "learning_rate": 0.009597150552528964, + "loss": 2.9946, + "mean_token_accuracy": 0.4048443138599396, + "num_tokens": 5325350782.0, + "step": 10417 + }, + { + "epoch": 2.8171984856679284, + "grad_norm": 2.21875, + "learning_rate": 0.009595593353908504, + "loss": 2.9715, + "mean_token_accuracy": 0.43073344230651855, + "num_tokens": 5325874218.0, + "step": 10418 + }, + { + "epoch": 2.8174689021092485, + "grad_norm": 2.859375, + "learning_rate": 0.009594036198379374, + "loss": 3.0387, + "mean_token_accuracy": 0.40717512369155884, + "num_tokens": 5326398398.0, + "step": 10419 + }, + { + "epoch": 2.8177393185505677, + "grad_norm": 2.71875, + "learning_rate": 0.009592479085989347, + "loss": 2.988, + "mean_token_accuracy": 0.4118516445159912, + "num_tokens": 5326922520.0, + "step": 10420 + }, + { + "epoch": 2.818009734991888, + "grad_norm": 3.0625, + "learning_rate": 0.0095909220167862, + "loss": 3.0497, + "mean_token_accuracy": 0.4222336709499359, + "num_tokens": 5327446798.0, + "step": 10421 + }, + { + "epoch": 2.818280151433207, + "grad_norm": 2.96875, + "learning_rate": 0.009589364990817709, + "loss": 3.1223, + "mean_token_accuracy": 0.40824663639068604, + "num_tokens": 5327970954.0, + "step": 10422 + }, + { + "epoch": 2.8185505678745266, + "grad_norm": 3.5, + "learning_rate": 0.00958780800813165, + "loss": 2.9851, + "mean_token_accuracy": 0.4236050546169281, + "num_tokens": 5328495140.0, + "step": 10423 + }, + { + "epoch": 2.8188209843158463, + "grad_norm": 2.96875, + "learning_rate": 0.009586251068775791, + "loss": 2.956, + "mean_token_accuracy": 0.4124634563922882, + "num_tokens": 5329019259.0, + "step": 10424 + }, + { + "epoch": 2.819091400757166, + "grad_norm": 3.046875, + "learning_rate": 0.00958469417279791, + "loss": 3.0272, + "mean_token_accuracy": 0.40904325246810913, + "num_tokens": 5329504095.0, + "step": 10425 + }, + { + "epoch": 2.8193618171984856, + "grad_norm": 2.703125, + "learning_rate": 0.009583137320245775, + "loss": 3.221, + "mean_token_accuracy": 0.3917125463485718, + "num_tokens": 5330028240.0, + "step": 10426 + }, + { + "epoch": 2.819632233639805, + "grad_norm": 2.203125, + "learning_rate": 0.00958158051116715, + "loss": 2.8317, + "mean_token_accuracy": 0.4359729588031769, + "num_tokens": 5330517991.0, + "step": 10427 + }, + { + "epoch": 2.819902650081125, + "grad_norm": 2.5625, + "learning_rate": 0.00958002374560981, + "loss": 2.8108, + "mean_token_accuracy": 0.4178885221481323, + "num_tokens": 5331036295.0, + "step": 10428 + }, + { + "epoch": 2.8201730665224445, + "grad_norm": 2.859375, + "learning_rate": 0.009578467023621513, + "loss": 2.7799, + "mean_token_accuracy": 0.42321962118148804, + "num_tokens": 5331560569.0, + "step": 10429 + }, + { + "epoch": 2.820443482963764, + "grad_norm": 2.390625, + "learning_rate": 0.00957691034525003, + "loss": 3.0747, + "mean_token_accuracy": 0.4178146719932556, + "num_tokens": 5332077555.0, + "step": 10430 + }, + { + "epoch": 2.8207138994050838, + "grad_norm": 7.21875, + "learning_rate": 0.009575353710543124, + "loss": 9.8736, + "mean_token_accuracy": 0.006923270411789417, + "num_tokens": 5332601825.0, + "step": 10431 + }, + { + "epoch": 2.8209843158464034, + "grad_norm": 6.875, + "learning_rate": 0.009573797119548557, + "loss": 3.4576, + "mean_token_accuracy": 0.37804296612739563, + "num_tokens": 5333109051.0, + "step": 10432 + }, + { + "epoch": 2.821254732287723, + "grad_norm": 2.578125, + "learning_rate": 0.009572240572314084, + "loss": 3.0184, + "mean_token_accuracy": 0.4042981266975403, + "num_tokens": 5333629761.0, + "step": 10433 + }, + { + "epoch": 2.8215251487290427, + "grad_norm": 2.046875, + "learning_rate": 0.009570684068887472, + "loss": 3.1284, + "mean_token_accuracy": 0.39605242013931274, + "num_tokens": 5334154040.0, + "step": 10434 + }, + { + "epoch": 2.8217955651703623, + "grad_norm": 4.03125, + "learning_rate": 0.009569127609316478, + "loss": 3.1681, + "mean_token_accuracy": 0.4106792211532593, + "num_tokens": 5334678268.0, + "step": 10435 + }, + { + "epoch": 2.822065981611682, + "grad_norm": 2.578125, + "learning_rate": 0.009567571193648853, + "loss": 3.0221, + "mean_token_accuracy": 0.4031677842140198, + "num_tokens": 5335202480.0, + "step": 10436 + }, + { + "epoch": 2.8223363980530016, + "grad_norm": 3.15625, + "learning_rate": 0.009566014821932357, + "loss": 2.9207, + "mean_token_accuracy": 0.38071155548095703, + "num_tokens": 5335726731.0, + "step": 10437 + }, + { + "epoch": 2.8226068144943213, + "grad_norm": 2.65625, + "learning_rate": 0.00956445849421474, + "loss": 3.1309, + "mean_token_accuracy": 0.41391050815582275, + "num_tokens": 5336227186.0, + "step": 10438 + }, + { + "epoch": 2.822877230935641, + "grad_norm": 3.421875, + "learning_rate": 0.009562902210543765, + "loss": 2.9708, + "mean_token_accuracy": 0.4276771545410156, + "num_tokens": 5336751434.0, + "step": 10439 + }, + { + "epoch": 2.8231476473769606, + "grad_norm": 3.109375, + "learning_rate": 0.009561345970967175, + "loss": 3.0062, + "mean_token_accuracy": 0.4390455186367035, + "num_tokens": 5337275693.0, + "step": 10440 + }, + { + "epoch": 2.82341806381828, + "grad_norm": 3.3125, + "learning_rate": 0.009559789775532717, + "loss": 3.0487, + "mean_token_accuracy": 0.41026076674461365, + "num_tokens": 5337786647.0, + "step": 10441 + }, + { + "epoch": 2.8236884802596, + "grad_norm": 2.609375, + "learning_rate": 0.009558233624288148, + "loss": 2.8719, + "mean_token_accuracy": 0.42139965295791626, + "num_tokens": 5338310822.0, + "step": 10442 + }, + { + "epoch": 2.8239588967009195, + "grad_norm": 2.25, + "learning_rate": 0.009556677517281212, + "loss": 2.8919, + "mean_token_accuracy": 0.43081146478652954, + "num_tokens": 5338835004.0, + "step": 10443 + }, + { + "epoch": 2.824229313142239, + "grad_norm": 2.671875, + "learning_rate": 0.00955512145455965, + "loss": 3.0975, + "mean_token_accuracy": 0.4088325798511505, + "num_tokens": 5339359185.0, + "step": 10444 + }, + { + "epoch": 2.8244997295835588, + "grad_norm": 3.078125, + "learning_rate": 0.009553565436171214, + "loss": 3.0552, + "mean_token_accuracy": 0.40745288133621216, + "num_tokens": 5339857522.0, + "step": 10445 + }, + { + "epoch": 2.8247701460248784, + "grad_norm": 2.8125, + "learning_rate": 0.009552009462163644, + "loss": 2.9147, + "mean_token_accuracy": 0.431184321641922, + "num_tokens": 5340381796.0, + "step": 10446 + }, + { + "epoch": 2.825040562466198, + "grad_norm": 3.25, + "learning_rate": 0.009550453532584682, + "loss": 3.3438, + "mean_token_accuracy": 0.3747802972793579, + "num_tokens": 5340905982.0, + "step": 10447 + }, + { + "epoch": 2.8253109789075177, + "grad_norm": 3.359375, + "learning_rate": 0.009548897647482069, + "loss": 3.0729, + "mean_token_accuracy": 0.4275650978088379, + "num_tokens": 5341368647.0, + "step": 10448 + }, + { + "epoch": 2.8255813953488373, + "grad_norm": 3.015625, + "learning_rate": 0.009547341806903545, + "loss": 3.1327, + "mean_token_accuracy": 0.4131612777709961, + "num_tokens": 5341892918.0, + "step": 10449 + }, + { + "epoch": 2.825851811790157, + "grad_norm": 2.78125, + "learning_rate": 0.00954578601089684, + "loss": 2.9402, + "mean_token_accuracy": 0.4108341932296753, + "num_tokens": 5342417102.0, + "step": 10450 + }, + { + "epoch": 2.8261222282314766, + "grad_norm": 10.625, + "learning_rate": 0.009544230259509701, + "loss": 9.6005, + "mean_token_accuracy": 0.011391941457986832, + "num_tokens": 5342941291.0, + "step": 10451 + }, + { + "epoch": 2.8263926446727963, + "grad_norm": 6.65625, + "learning_rate": 0.00954267455278986, + "loss": 3.1542, + "mean_token_accuracy": 0.39342591166496277, + "num_tokens": 5343465456.0, + "step": 10452 + }, + { + "epoch": 2.8266630611141155, + "grad_norm": 2.578125, + "learning_rate": 0.009541118890785046, + "loss": 3.1413, + "mean_token_accuracy": 0.4055044651031494, + "num_tokens": 5343989723.0, + "step": 10453 + }, + { + "epoch": 2.8269334775554356, + "grad_norm": 3.3125, + "learning_rate": 0.009539563273542996, + "loss": 2.8931, + "mean_token_accuracy": 0.4066370129585266, + "num_tokens": 5344513943.0, + "step": 10454 + }, + { + "epoch": 2.8272038939967548, + "grad_norm": 3.921875, + "learning_rate": 0.009538007701111444, + "loss": 2.9378, + "mean_token_accuracy": 0.38839268684387207, + "num_tokens": 5345038038.0, + "step": 10455 + }, + { + "epoch": 2.827474310438075, + "grad_norm": 2.71875, + "learning_rate": 0.009536452173538108, + "loss": 3.0227, + "mean_token_accuracy": 0.4273748993873596, + "num_tokens": 5345532121.0, + "step": 10456 + }, + { + "epoch": 2.827744726879394, + "grad_norm": 3.96875, + "learning_rate": 0.00953489669087073, + "loss": 2.8857, + "mean_token_accuracy": 0.4204690456390381, + "num_tokens": 5345998637.0, + "step": 10457 + }, + { + "epoch": 2.828015143320714, + "grad_norm": 3.0625, + "learning_rate": 0.009533341253157026, + "loss": 2.9225, + "mean_token_accuracy": 0.4094832241535187, + "num_tokens": 5346522745.0, + "step": 10458 + }, + { + "epoch": 2.8282855597620333, + "grad_norm": 3.0625, + "learning_rate": 0.009531785860444723, + "loss": 3.0609, + "mean_token_accuracy": 0.4104020595550537, + "num_tokens": 5347033214.0, + "step": 10459 + }, + { + "epoch": 2.8285559762033534, + "grad_norm": 2.078125, + "learning_rate": 0.009530230512781554, + "loss": 2.8023, + "mean_token_accuracy": 0.41081875562667847, + "num_tokens": 5347557369.0, + "step": 10460 + }, + { + "epoch": 2.8288263926446726, + "grad_norm": 28.75, + "learning_rate": 0.00952867521021523, + "loss": 3.2382, + "mean_token_accuracy": 0.40299612283706665, + "num_tokens": 5348052854.0, + "step": 10461 + }, + { + "epoch": 2.8290968090859927, + "grad_norm": 3.765625, + "learning_rate": 0.009527119952793482, + "loss": 3.3431, + "mean_token_accuracy": 0.37676936388015747, + "num_tokens": 5348577096.0, + "step": 10462 + }, + { + "epoch": 2.829367225527312, + "grad_norm": 2.421875, + "learning_rate": 0.009525564740564024, + "loss": 3.0805, + "mean_token_accuracy": 0.4584871530532837, + "num_tokens": 5349035884.0, + "step": 10463 + }, + { + "epoch": 2.8296376419686315, + "grad_norm": 2.890625, + "learning_rate": 0.009524009573574573, + "loss": 3.1256, + "mean_token_accuracy": 0.3988161087036133, + "num_tokens": 5349560162.0, + "step": 10464 + }, + { + "epoch": 2.829908058409951, + "grad_norm": 2.140625, + "learning_rate": 0.009522454451872848, + "loss": 2.9123, + "mean_token_accuracy": 0.4100038409233093, + "num_tokens": 5350084337.0, + "step": 10465 + }, + { + "epoch": 2.830178474851271, + "grad_norm": 2.109375, + "learning_rate": 0.009520899375506571, + "loss": 2.8887, + "mean_token_accuracy": 0.43786418437957764, + "num_tokens": 5350545308.0, + "step": 10466 + }, + { + "epoch": 2.8304488912925905, + "grad_norm": 2.453125, + "learning_rate": 0.009519344344523447, + "loss": 3.0682, + "mean_token_accuracy": 0.42180126905441284, + "num_tokens": 5351058280.0, + "step": 10467 + }, + { + "epoch": 2.83071930773391, + "grad_norm": 2.203125, + "learning_rate": 0.009517789358971194, + "loss": 2.8982, + "mean_token_accuracy": 0.423200786113739, + "num_tokens": 5351582552.0, + "step": 10468 + }, + { + "epoch": 2.8309897241752298, + "grad_norm": 2.359375, + "learning_rate": 0.009516234418897524, + "loss": 3.1652, + "mean_token_accuracy": 0.41567009687423706, + "num_tokens": 5352030518.0, + "step": 10469 + }, + { + "epoch": 2.8312601406165494, + "grad_norm": 3.734375, + "learning_rate": 0.009514679524350141, + "loss": 3.1316, + "mean_token_accuracy": 0.4200613498687744, + "num_tokens": 5352554698.0, + "step": 10470 + }, + { + "epoch": 2.831530557057869, + "grad_norm": 25.75, + "learning_rate": 0.009513124675376763, + "loss": 10.4632, + "mean_token_accuracy": 0.0074845170602202415, + "num_tokens": 5353078953.0, + "step": 10471 + }, + { + "epoch": 2.8318009734991887, + "grad_norm": 11.9375, + "learning_rate": 0.009511569872025092, + "loss": 3.5417, + "mean_token_accuracy": 0.33484309911727905, + "num_tokens": 5353563999.0, + "step": 10472 + }, + { + "epoch": 2.8320713899405083, + "grad_norm": 2.765625, + "learning_rate": 0.009510015114342829, + "loss": 3.0067, + "mean_token_accuracy": 0.39795807003974915, + "num_tokens": 5354088085.0, + "step": 10473 + }, + { + "epoch": 2.832341806381828, + "grad_norm": 8.125, + "learning_rate": 0.00950846040237769, + "loss": 3.0501, + "mean_token_accuracy": 0.4177256226539612, + "num_tokens": 5354612333.0, + "step": 10474 + }, + { + "epoch": 2.8326122228231476, + "grad_norm": 3.0, + "learning_rate": 0.009506905736177372, + "loss": 2.9145, + "mean_token_accuracy": 0.41903677582740784, + "num_tokens": 5355136383.0, + "step": 10475 + }, + { + "epoch": 2.8328826392644673, + "grad_norm": 2.171875, + "learning_rate": 0.009505351115789573, + "loss": 3.2324, + "mean_token_accuracy": 0.413724422454834, + "num_tokens": 5355562510.0, + "step": 10476 + }, + { + "epoch": 2.833153055705787, + "grad_norm": 2.375, + "learning_rate": 0.009503796541262002, + "loss": 2.923, + "mean_token_accuracy": 0.4125756025314331, + "num_tokens": 5356086646.0, + "step": 10477 + }, + { + "epoch": 2.8334234721471065, + "grad_norm": 2.84375, + "learning_rate": 0.00950224201264235, + "loss": 3.1671, + "mean_token_accuracy": 0.4081123471260071, + "num_tokens": 5356610880.0, + "step": 10478 + }, + { + "epoch": 2.833693888588426, + "grad_norm": 3.015625, + "learning_rate": 0.009500687529978317, + "loss": 3.1303, + "mean_token_accuracy": 0.3990088105201721, + "num_tokens": 5357134960.0, + "step": 10479 + }, + { + "epoch": 2.833964305029746, + "grad_norm": 3.40625, + "learning_rate": 0.009499133093317602, + "loss": 3.008, + "mean_token_accuracy": 0.40965306758880615, + "num_tokens": 5357659173.0, + "step": 10480 + }, + { + "epoch": 2.8342347214710655, + "grad_norm": 2.9375, + "learning_rate": 0.009497578702707898, + "loss": 2.9512, + "mean_token_accuracy": 0.4046792685985565, + "num_tokens": 5358183439.0, + "step": 10481 + }, + { + "epoch": 2.834505137912385, + "grad_norm": 2.8125, + "learning_rate": 0.009496024358196893, + "loss": 2.9161, + "mean_token_accuracy": 0.4206753373146057, + "num_tokens": 5358707640.0, + "step": 10482 + }, + { + "epoch": 2.8347755543537048, + "grad_norm": 2.90625, + "learning_rate": 0.00949447005983229, + "loss": 2.9467, + "mean_token_accuracy": 0.4099839925765991, + "num_tokens": 5359231799.0, + "step": 10483 + }, + { + "epoch": 2.8350459707950244, + "grad_norm": 3.0625, + "learning_rate": 0.009492915807661767, + "loss": 3.1183, + "mean_token_accuracy": 0.4226362109184265, + "num_tokens": 5359720436.0, + "step": 10484 + }, + { + "epoch": 2.835316387236344, + "grad_norm": 2.796875, + "learning_rate": 0.009491361601733025, + "loss": 2.8577, + "mean_token_accuracy": 0.40141043066978455, + "num_tokens": 5360244481.0, + "step": 10485 + }, + { + "epoch": 2.8355868036776637, + "grad_norm": 3.265625, + "learning_rate": 0.009489807442093741, + "loss": 3.2, + "mean_token_accuracy": 0.40474361181259155, + "num_tokens": 5360768669.0, + "step": 10486 + }, + { + "epoch": 2.8358572201189833, + "grad_norm": 3.515625, + "learning_rate": 0.009488253328791605, + "loss": 3.0772, + "mean_token_accuracy": 0.4135008156299591, + "num_tokens": 5361215121.0, + "step": 10487 + }, + { + "epoch": 2.836127636560303, + "grad_norm": 3.125, + "learning_rate": 0.009486699261874308, + "loss": 2.9031, + "mean_token_accuracy": 0.40435630083084106, + "num_tokens": 5361739316.0, + "step": 10488 + }, + { + "epoch": 2.8363980530016226, + "grad_norm": 3.4375, + "learning_rate": 0.009485145241389525, + "loss": 3.2005, + "mean_token_accuracy": 0.41663655638694763, + "num_tokens": 5362208500.0, + "step": 10489 + }, + { + "epoch": 2.8366684694429423, + "grad_norm": 3.765625, + "learning_rate": 0.009483591267384938, + "loss": 3.1154, + "mean_token_accuracy": 0.3980623781681061, + "num_tokens": 5362698383.0, + "step": 10490 + }, + { + "epoch": 2.836938885884262, + "grad_norm": 95.0, + "learning_rate": 0.009482037339908237, + "loss": 14.6576, + "mean_token_accuracy": 0.0035556345246732235, + "num_tokens": 5363222641.0, + "step": 10491 + }, + { + "epoch": 2.8372093023255816, + "grad_norm": 5.875, + "learning_rate": 0.00948048345900709, + "loss": 3.3038, + "mean_token_accuracy": 0.3867943584918976, + "num_tokens": 5363699459.0, + "step": 10492 + }, + { + "epoch": 2.837479718766901, + "grad_norm": 2.09375, + "learning_rate": 0.00947892962472918, + "loss": 3.189, + "mean_token_accuracy": 0.4025598168373108, + "num_tokens": 5364223647.0, + "step": 10493 + }, + { + "epoch": 2.8377501352082204, + "grad_norm": 2.421875, + "learning_rate": 0.009477375837122182, + "loss": 2.8694, + "mean_token_accuracy": 0.4080774188041687, + "num_tokens": 5364747896.0, + "step": 10494 + }, + { + "epoch": 2.8380205516495405, + "grad_norm": 2.96875, + "learning_rate": 0.009475822096233773, + "loss": 3.0306, + "mean_token_accuracy": 0.45268771052360535, + "num_tokens": 5365207778.0, + "step": 10495 + }, + { + "epoch": 2.8382909680908597, + "grad_norm": 2.921875, + "learning_rate": 0.009474268402111621, + "loss": 2.8405, + "mean_token_accuracy": 0.4152968227863312, + "num_tokens": 5365731968.0, + "step": 10496 + }, + { + "epoch": 2.8385613845321798, + "grad_norm": 3.265625, + "learning_rate": 0.009472714754803404, + "loss": 2.9915, + "mean_token_accuracy": 0.43920770287513733, + "num_tokens": 5366208027.0, + "step": 10497 + }, + { + "epoch": 2.838831800973499, + "grad_norm": 3.453125, + "learning_rate": 0.009471161154356792, + "loss": 3.1669, + "mean_token_accuracy": 0.3915362060070038, + "num_tokens": 5366732285.0, + "step": 10498 + }, + { + "epoch": 2.839102217414819, + "grad_norm": 3.125, + "learning_rate": 0.009469607600819446, + "loss": 2.9826, + "mean_token_accuracy": 0.39869973063468933, + "num_tokens": 5367256429.0, + "step": 10499 + }, + { + "epoch": 2.8393726338561383, + "grad_norm": 2.8125, + "learning_rate": 0.00946805409423904, + "loss": 2.8729, + "mean_token_accuracy": 0.40894877910614014, + "num_tokens": 5367757064.0, + "step": 10500 + }, + { + "epoch": 2.8396430502974583, + "grad_norm": 2.578125, + "learning_rate": 0.009466500634663245, + "loss": 3.1185, + "mean_token_accuracy": 0.410106360912323, + "num_tokens": 5368246857.0, + "step": 10501 + }, + { + "epoch": 2.8399134667387775, + "grad_norm": 3.578125, + "learning_rate": 0.009464947222139712, + "loss": 3.1346, + "mean_token_accuracy": 0.4040977358818054, + "num_tokens": 5368771111.0, + "step": 10502 + }, + { + "epoch": 2.8401838831800976, + "grad_norm": 2.890625, + "learning_rate": 0.00946339385671612, + "loss": 3.2901, + "mean_token_accuracy": 0.4008733630180359, + "num_tokens": 5369290477.0, + "step": 10503 + }, + { + "epoch": 2.840454299621417, + "grad_norm": 3.40625, + "learning_rate": 0.009461840538440123, + "loss": 3.0494, + "mean_token_accuracy": 0.4084109663963318, + "num_tokens": 5369814680.0, + "step": 10504 + }, + { + "epoch": 2.8407247160627365, + "grad_norm": 2.484375, + "learning_rate": 0.009460287267359377, + "loss": 2.8573, + "mean_token_accuracy": 0.4304487705230713, + "num_tokens": 5370338730.0, + "step": 10505 + }, + { + "epoch": 2.840995132504056, + "grad_norm": 3.578125, + "learning_rate": 0.009458734043521549, + "loss": 3.1241, + "mean_token_accuracy": 0.3999236226081848, + "num_tokens": 5370807109.0, + "step": 10506 + }, + { + "epoch": 2.8412655489453758, + "grad_norm": 2.75, + "learning_rate": 0.009457180866974288, + "loss": 3.0212, + "mean_token_accuracy": 0.4174801707267761, + "num_tokens": 5371273379.0, + "step": 10507 + }, + { + "epoch": 2.8415359653866954, + "grad_norm": 2.515625, + "learning_rate": 0.009455627737765262, + "loss": 2.9333, + "mean_token_accuracy": 0.39966750144958496, + "num_tokens": 5371797657.0, + "step": 10508 + }, + { + "epoch": 2.841806381828015, + "grad_norm": 2.578125, + "learning_rate": 0.009454074655942118, + "loss": 3.1104, + "mean_token_accuracy": 0.40487855672836304, + "num_tokens": 5372321873.0, + "step": 10509 + }, + { + "epoch": 2.8420767982693347, + "grad_norm": 3.4375, + "learning_rate": 0.009452521621552504, + "loss": 2.9965, + "mean_token_accuracy": 0.4223800301551819, + "num_tokens": 5372846042.0, + "step": 10510 + }, + { + "epoch": 2.8423472147106543, + "grad_norm": 258.0, + "learning_rate": 0.009450968634644084, + "loss": 16.7089, + "mean_token_accuracy": 0.0025927405804395676, + "num_tokens": 5373362766.0, + "step": 10511 + }, + { + "epoch": 2.842617631151974, + "grad_norm": 6.21875, + "learning_rate": 0.0094494156952645, + "loss": 3.3804, + "mean_token_accuracy": 0.3938581943511963, + "num_tokens": 5373886918.0, + "step": 10512 + }, + { + "epoch": 2.8428880475932936, + "grad_norm": 2.015625, + "learning_rate": 0.0094478628034614, + "loss": 3.0476, + "mean_token_accuracy": 0.39344537258148193, + "num_tokens": 5374411137.0, + "step": 10513 + }, + { + "epoch": 2.8431584640346133, + "grad_norm": 2.71875, + "learning_rate": 0.009446309959282435, + "loss": 3.3189, + "mean_token_accuracy": 0.3855171501636505, + "num_tokens": 5374935315.0, + "step": 10514 + }, + { + "epoch": 2.843428880475933, + "grad_norm": 2.9375, + "learning_rate": 0.00944475716277525, + "loss": 3.0433, + "mean_token_accuracy": 0.4165429472923279, + "num_tokens": 5375459429.0, + "step": 10515 + }, + { + "epoch": 2.8436992969172525, + "grad_norm": 2.90625, + "learning_rate": 0.009443204413987485, + "loss": 3.0951, + "mean_token_accuracy": 0.3983646631240845, + "num_tokens": 5375952235.0, + "step": 10516 + }, + { + "epoch": 2.843969713358572, + "grad_norm": 3.140625, + "learning_rate": 0.009441651712966794, + "loss": 2.47, + "mean_token_accuracy": 0.4857900142669678, + "num_tokens": 5376476324.0, + "step": 10517 + }, + { + "epoch": 2.844240129799892, + "grad_norm": 2.96875, + "learning_rate": 0.00944009905976081, + "loss": 3.0394, + "mean_token_accuracy": 0.41320621967315674, + "num_tokens": 5377000492.0, + "step": 10518 + }, + { + "epoch": 2.8445105462412115, + "grad_norm": 3.546875, + "learning_rate": 0.009438546454417168, + "loss": 2.9632, + "mean_token_accuracy": 0.4128153324127197, + "num_tokens": 5377524640.0, + "step": 10519 + }, + { + "epoch": 2.844780962682531, + "grad_norm": 2.6875, + "learning_rate": 0.009436993896983518, + "loss": 2.9433, + "mean_token_accuracy": 0.44331789016723633, + "num_tokens": 5377991479.0, + "step": 10520 + }, + { + "epoch": 2.8450513791238508, + "grad_norm": 3.109375, + "learning_rate": 0.00943544138750749, + "loss": 3.0514, + "mean_token_accuracy": 0.41684284806251526, + "num_tokens": 5378485343.0, + "step": 10521 + }, + { + "epoch": 2.8453217955651704, + "grad_norm": 2.71875, + "learning_rate": 0.00943388892603672, + "loss": 2.8941, + "mean_token_accuracy": 0.4147648811340332, + "num_tokens": 5378916236.0, + "step": 10522 + }, + { + "epoch": 2.84559221200649, + "grad_norm": 2.5, + "learning_rate": 0.009432336512618846, + "loss": 3.0384, + "mean_token_accuracy": 0.41577231884002686, + "num_tokens": 5379440515.0, + "step": 10523 + }, + { + "epoch": 2.8458626284478097, + "grad_norm": 2.6875, + "learning_rate": 0.009430784147301498, + "loss": 3.1461, + "mean_token_accuracy": 0.3970297873020172, + "num_tokens": 5379964682.0, + "step": 10524 + }, + { + "epoch": 2.8461330448891293, + "grad_norm": 3.296875, + "learning_rate": 0.009429231830132305, + "loss": 2.994, + "mean_token_accuracy": 0.4069732427597046, + "num_tokens": 5380488880.0, + "step": 10525 + }, + { + "epoch": 2.846403461330449, + "grad_norm": 3.125, + "learning_rate": 0.009427679561158902, + "loss": 3.158, + "mean_token_accuracy": 0.4062870442867279, + "num_tokens": 5381013111.0, + "step": 10526 + }, + { + "epoch": 2.8466738777717686, + "grad_norm": 3.484375, + "learning_rate": 0.009426127340428912, + "loss": 3.1656, + "mean_token_accuracy": 0.3940773904323578, + "num_tokens": 5381537159.0, + "step": 10527 + }, + { + "epoch": 2.8469442942130883, + "grad_norm": 2.625, + "learning_rate": 0.009424575167989961, + "loss": 2.8788, + "mean_token_accuracy": 0.42049700021743774, + "num_tokens": 5382061359.0, + "step": 10528 + }, + { + "epoch": 2.847214710654408, + "grad_norm": 2.703125, + "learning_rate": 0.009423023043889681, + "loss": 3.0008, + "mean_token_accuracy": 0.4219590127468109, + "num_tokens": 5382585506.0, + "step": 10529 + }, + { + "epoch": 2.8474851270957275, + "grad_norm": 3.125, + "learning_rate": 0.009421470968175687, + "loss": 2.9304, + "mean_token_accuracy": 0.4164828360080719, + "num_tokens": 5383104293.0, + "step": 10530 + }, + { + "epoch": 2.847755543537047, + "grad_norm": 126.5, + "learning_rate": 0.00941991894089561, + "loss": 34.8013, + "mean_token_accuracy": 4.9831320211524144e-05, + "num_tokens": 5383628307.0, + "step": 10531 + }, + { + "epoch": 2.848025959978367, + "grad_norm": 5.375, + "learning_rate": 0.009418366962097066, + "loss": 3.3282, + "mean_token_accuracy": 0.37504976987838745, + "num_tokens": 5384152520.0, + "step": 10532 + }, + { + "epoch": 2.8482963764196865, + "grad_norm": 1.6640625, + "learning_rate": 0.009416815031827674, + "loss": 2.766, + "mean_token_accuracy": 0.42701220512390137, + "num_tokens": 5384676700.0, + "step": 10533 + }, + { + "epoch": 2.848566792861006, + "grad_norm": 2.625, + "learning_rate": 0.009415263150135056, + "loss": 3.2297, + "mean_token_accuracy": 0.4091448187828064, + "num_tokens": 5385168781.0, + "step": 10534 + }, + { + "epoch": 2.8488372093023253, + "grad_norm": 3.421875, + "learning_rate": 0.00941371131706682, + "loss": 3.1404, + "mean_token_accuracy": 0.4067627191543579, + "num_tokens": 5385693016.0, + "step": 10535 + }, + { + "epoch": 2.8491076257436454, + "grad_norm": 3.875, + "learning_rate": 0.009412159532670588, + "loss": 2.8337, + "mean_token_accuracy": 0.40846332907676697, + "num_tokens": 5386217228.0, + "step": 10536 + }, + { + "epoch": 2.8493780421849646, + "grad_norm": 2.375, + "learning_rate": 0.00941060779699397, + "loss": 2.8154, + "mean_token_accuracy": 0.4321225881576538, + "num_tokens": 5386741508.0, + "step": 10537 + }, + { + "epoch": 2.8496484586262847, + "grad_norm": 2.671875, + "learning_rate": 0.009409056110084581, + "loss": 2.9572, + "mean_token_accuracy": 0.41487425565719604, + "num_tokens": 5387265531.0, + "step": 10538 + }, + { + "epoch": 2.849918875067604, + "grad_norm": 3.09375, + "learning_rate": 0.009407504471990026, + "loss": 3.0988, + "mean_token_accuracy": 0.3989790976047516, + "num_tokens": 5387789726.0, + "step": 10539 + }, + { + "epoch": 2.850189291508924, + "grad_norm": 3.15625, + "learning_rate": 0.00940595288275792, + "loss": 3.252, + "mean_token_accuracy": 0.3714163303375244, + "num_tokens": 5388313922.0, + "step": 10540 + }, + { + "epoch": 2.850459707950243, + "grad_norm": 3.140625, + "learning_rate": 0.00940440134243587, + "loss": 2.859, + "mean_token_accuracy": 0.4573722779750824, + "num_tokens": 5388838081.0, + "step": 10541 + }, + { + "epoch": 2.8507301243915633, + "grad_norm": 1.8828125, + "learning_rate": 0.009402849851071472, + "loss": 2.9768, + "mean_token_accuracy": 0.4212923049926758, + "num_tokens": 5389362311.0, + "step": 10542 + }, + { + "epoch": 2.8510005408328825, + "grad_norm": 3.046875, + "learning_rate": 0.00940129840871234, + "loss": 2.94, + "mean_token_accuracy": 0.41868919134140015, + "num_tokens": 5389886506.0, + "step": 10543 + }, + { + "epoch": 2.8512709572742025, + "grad_norm": 3.171875, + "learning_rate": 0.009399747015406077, + "loss": 2.8959, + "mean_token_accuracy": 0.42307597398757935, + "num_tokens": 5390403154.0, + "step": 10544 + }, + { + "epoch": 2.8515413737155217, + "grad_norm": 3.953125, + "learning_rate": 0.009398195671200277, + "loss": 3.0748, + "mean_token_accuracy": 0.39406466484069824, + "num_tokens": 5390927417.0, + "step": 10545 + }, + { + "epoch": 2.8518117901568414, + "grad_norm": 3.265625, + "learning_rate": 0.009396644376142549, + "loss": 3.163, + "mean_token_accuracy": 0.40804725885391235, + "num_tokens": 5391451498.0, + "step": 10546 + }, + { + "epoch": 2.852082206598161, + "grad_norm": 3.125, + "learning_rate": 0.009395093130280488, + "loss": 2.8809, + "mean_token_accuracy": 0.39551377296447754, + "num_tokens": 5391975764.0, + "step": 10547 + }, + { + "epoch": 2.8523526230394807, + "grad_norm": 2.359375, + "learning_rate": 0.009393541933661684, + "loss": 3.072, + "mean_token_accuracy": 0.4065069556236267, + "num_tokens": 5392500013.0, + "step": 10548 + }, + { + "epoch": 2.8526230394808003, + "grad_norm": 3.796875, + "learning_rate": 0.009391990786333742, + "loss": 3.2453, + "mean_token_accuracy": 0.3877243399620056, + "num_tokens": 5392998787.0, + "step": 10549 + }, + { + "epoch": 2.85289345592212, + "grad_norm": 2.328125, + "learning_rate": 0.009390439688344248, + "loss": 2.8001, + "mean_token_accuracy": 0.41411304473876953, + "num_tokens": 5393472707.0, + "step": 10550 + }, + { + "epoch": 2.8531638723634396, + "grad_norm": 11.125, + "learning_rate": 0.009388888639740798, + "loss": 11.4564, + "mean_token_accuracy": 0.0002393648464931175, + "num_tokens": 5393904506.0, + "step": 10551 + }, + { + "epoch": 2.8534342888047592, + "grad_norm": 6.65625, + "learning_rate": 0.009387337640570986, + "loss": 3.3429, + "mean_token_accuracy": 0.34993237257003784, + "num_tokens": 5394428669.0, + "step": 10552 + }, + { + "epoch": 2.853704705246079, + "grad_norm": 3.375, + "learning_rate": 0.009385786690882395, + "loss": 3.1231, + "mean_token_accuracy": 0.39519891142845154, + "num_tokens": 5394952805.0, + "step": 10553 + }, + { + "epoch": 2.8539751216873985, + "grad_norm": 2.9375, + "learning_rate": 0.009384235790722615, + "loss": 3.2323, + "mean_token_accuracy": 0.39402782917022705, + "num_tokens": 5395476998.0, + "step": 10554 + }, + { + "epoch": 2.854245538128718, + "grad_norm": 3.75, + "learning_rate": 0.009382684940139236, + "loss": 3.1978, + "mean_token_accuracy": 0.4001561403274536, + "num_tokens": 5396001244.0, + "step": 10555 + }, + { + "epoch": 2.854515954570038, + "grad_norm": 2.296875, + "learning_rate": 0.009381134139179834, + "loss": 2.859, + "mean_token_accuracy": 0.4283040761947632, + "num_tokens": 5396519051.0, + "step": 10556 + }, + { + "epoch": 2.8547863710113575, + "grad_norm": 2.703125, + "learning_rate": 0.009379583387892002, + "loss": 2.9985, + "mean_token_accuracy": 0.3903994560241699, + "num_tokens": 5397038694.0, + "step": 10557 + }, + { + "epoch": 2.855056787452677, + "grad_norm": 2.84375, + "learning_rate": 0.009378032686323317, + "loss": 2.9593, + "mean_token_accuracy": 0.40284478664398193, + "num_tokens": 5397562900.0, + "step": 10558 + }, + { + "epoch": 2.8553272038939967, + "grad_norm": 2.6875, + "learning_rate": 0.009376482034521355, + "loss": 3.0651, + "mean_token_accuracy": 0.40083783864974976, + "num_tokens": 5398087178.0, + "step": 10559 + }, + { + "epoch": 2.8555976203353164, + "grad_norm": 2.84375, + "learning_rate": 0.009374931432533703, + "loss": 3.1985, + "mean_token_accuracy": 0.4094889760017395, + "num_tokens": 5398611416.0, + "step": 10560 + }, + { + "epoch": 2.855868036776636, + "grad_norm": 2.640625, + "learning_rate": 0.009373380880407931, + "loss": 3.0257, + "mean_token_accuracy": 0.4248717129230499, + "num_tokens": 5399091166.0, + "step": 10561 + }, + { + "epoch": 2.8561384532179557, + "grad_norm": 2.625, + "learning_rate": 0.009371830378191615, + "loss": 2.8614, + "mean_token_accuracy": 0.4270976781845093, + "num_tokens": 5399535057.0, + "step": 10562 + }, + { + "epoch": 2.8564088696592753, + "grad_norm": 3.15625, + "learning_rate": 0.009370279925932336, + "loss": 2.9524, + "mean_token_accuracy": 0.43787962198257446, + "num_tokens": 5399993814.0, + "step": 10563 + }, + { + "epoch": 2.856679286100595, + "grad_norm": 2.625, + "learning_rate": 0.00936872952367766, + "loss": 3.1119, + "mean_token_accuracy": 0.3990727663040161, + "num_tokens": 5400463470.0, + "step": 10564 + }, + { + "epoch": 2.8569497025419146, + "grad_norm": 3.0625, + "learning_rate": 0.009367179171475155, + "loss": 3.008, + "mean_token_accuracy": 0.4080439805984497, + "num_tokens": 5400987751.0, + "step": 10565 + }, + { + "epoch": 2.8572201189832342, + "grad_norm": 2.78125, + "learning_rate": 0.009365628869372403, + "loss": 2.8406, + "mean_token_accuracy": 0.4338923692703247, + "num_tokens": 5401511990.0, + "step": 10566 + }, + { + "epoch": 2.857490535424554, + "grad_norm": 2.96875, + "learning_rate": 0.009364078617416961, + "loss": 3.2692, + "mean_token_accuracy": 0.4065292775630951, + "num_tokens": 5401979898.0, + "step": 10567 + }, + { + "epoch": 2.8577609518658735, + "grad_norm": 3.109375, + "learning_rate": 0.009362528415656395, + "loss": 3.2293, + "mean_token_accuracy": 0.4012061357498169, + "num_tokens": 5402497947.0, + "step": 10568 + }, + { + "epoch": 2.858031368307193, + "grad_norm": 2.546875, + "learning_rate": 0.009360978264138273, + "loss": 2.7734, + "mean_token_accuracy": 0.47017303109169006, + "num_tokens": 5403022192.0, + "step": 10569 + }, + { + "epoch": 2.858301784748513, + "grad_norm": 2.953125, + "learning_rate": 0.009359428162910159, + "loss": 3.2505, + "mean_token_accuracy": 0.4035314917564392, + "num_tokens": 5403506769.0, + "step": 10570 + }, + { + "epoch": 2.8585722011898325, + "grad_norm": 160.0, + "learning_rate": 0.009357878112019614, + "loss": 23.3441, + "mean_token_accuracy": 2.018842496909201e-05, + "num_tokens": 5404020694.0, + "step": 10571 + }, + { + "epoch": 2.858842617631152, + "grad_norm": 6.21875, + "learning_rate": 0.009356328111514198, + "loss": 3.1187, + "mean_token_accuracy": 0.41024598479270935, + "num_tokens": 5404493598.0, + "step": 10572 + }, + { + "epoch": 2.8591130340724717, + "grad_norm": 2.796875, + "learning_rate": 0.00935477816144147, + "loss": 3.2042, + "mean_token_accuracy": 0.38322126865386963, + "num_tokens": 5405017851.0, + "step": 10573 + }, + { + "epoch": 2.8593834505137914, + "grad_norm": 2.28125, + "learning_rate": 0.009353228261848983, + "loss": 2.9592, + "mean_token_accuracy": 0.4170956611633301, + "num_tokens": 5405542076.0, + "step": 10574 + }, + { + "epoch": 2.859653866955111, + "grad_norm": 3.265625, + "learning_rate": 0.009351678412784298, + "loss": 3.108, + "mean_token_accuracy": 0.4000813961029053, + "num_tokens": 5406066280.0, + "step": 10575 + }, + { + "epoch": 2.8599242833964302, + "grad_norm": 2.953125, + "learning_rate": 0.009350128614294963, + "loss": 2.9012, + "mean_token_accuracy": 0.4033733010292053, + "num_tokens": 5406588177.0, + "step": 10576 + }, + { + "epoch": 2.8601946998377503, + "grad_norm": 3.140625, + "learning_rate": 0.009348578866428542, + "loss": 3.0538, + "mean_token_accuracy": 0.4160769581794739, + "num_tokens": 5407112459.0, + "step": 10577 + }, + { + "epoch": 2.8604651162790695, + "grad_norm": 2.90625, + "learning_rate": 0.009347029169232572, + "loss": 3.0123, + "mean_token_accuracy": 0.4092203974723816, + "num_tokens": 5407636736.0, + "step": 10578 + }, + { + "epoch": 2.8607355327203896, + "grad_norm": 3.171875, + "learning_rate": 0.009345479522754609, + "loss": 3.076, + "mean_token_accuracy": 0.4317604899406433, + "num_tokens": 5408137184.0, + "step": 10579 + }, + { + "epoch": 2.861005949161709, + "grad_norm": 3.359375, + "learning_rate": 0.0093439299270422, + "loss": 2.9243, + "mean_token_accuracy": 0.42236441373825073, + "num_tokens": 5408639430.0, + "step": 10580 + }, + { + "epoch": 2.861276365603029, + "grad_norm": 2.859375, + "learning_rate": 0.009342380382142893, + "loss": 3.0757, + "mean_token_accuracy": 0.4148574471473694, + "num_tokens": 5409152806.0, + "step": 10581 + }, + { + "epoch": 2.861546782044348, + "grad_norm": 3.53125, + "learning_rate": 0.009340830888104225, + "loss": 3.0601, + "mean_token_accuracy": 0.4130867123603821, + "num_tokens": 5409677019.0, + "step": 10582 + }, + { + "epoch": 2.861817198485668, + "grad_norm": 2.75, + "learning_rate": 0.00933928144497375, + "loss": 3.0273, + "mean_token_accuracy": 0.4098712205886841, + "num_tokens": 5410201283.0, + "step": 10583 + }, + { + "epoch": 2.8620876149269874, + "grad_norm": 3.734375, + "learning_rate": 0.009337732052799006, + "loss": 3.2338, + "mean_token_accuracy": 0.4119335412979126, + "num_tokens": 5410675517.0, + "step": 10584 + }, + { + "epoch": 2.8623580313683075, + "grad_norm": 3.25, + "learning_rate": 0.009336182711627526, + "loss": 2.8788, + "mean_token_accuracy": 0.43028178811073303, + "num_tokens": 5411199770.0, + "step": 10585 + }, + { + "epoch": 2.8626284478096267, + "grad_norm": 3.0625, + "learning_rate": 0.009334633421506856, + "loss": 3.0317, + "mean_token_accuracy": 0.41733139753341675, + "num_tokens": 5411724030.0, + "step": 10586 + }, + { + "epoch": 2.8628988642509463, + "grad_norm": 3.609375, + "learning_rate": 0.009333084182484531, + "loss": 3.0012, + "mean_token_accuracy": 0.4021546244621277, + "num_tokens": 5412248039.0, + "step": 10587 + }, + { + "epoch": 2.863169280692266, + "grad_norm": 2.3125, + "learning_rate": 0.009331534994608079, + "loss": 3.0531, + "mean_token_accuracy": 0.4262515902519226, + "num_tokens": 5412734900.0, + "step": 10588 + }, + { + "epoch": 2.8634396971335856, + "grad_norm": 3.21875, + "learning_rate": 0.009329985857925048, + "loss": 3.0111, + "mean_token_accuracy": 0.4071301519870758, + "num_tokens": 5413259032.0, + "step": 10589 + }, + { + "epoch": 2.8637101135749052, + "grad_norm": 2.828125, + "learning_rate": 0.00932843677248296, + "loss": 3.2225, + "mean_token_accuracy": 0.39648735523223877, + "num_tokens": 5413783236.0, + "step": 10590 + }, + { + "epoch": 2.863980530016225, + "grad_norm": 88.0, + "learning_rate": 0.009326887738329345, + "loss": 16.0002, + "mean_token_accuracy": 0.03899715095758438, + "num_tokens": 5414307498.0, + "step": 10591 + }, + { + "epoch": 2.8642509464575445, + "grad_norm": 6.75, + "learning_rate": 0.009325338755511734, + "loss": 3.3009, + "mean_token_accuracy": 0.38491910696029663, + "num_tokens": 5414780898.0, + "step": 10592 + }, + { + "epoch": 2.864521362898864, + "grad_norm": 2.125, + "learning_rate": 0.00932378982407766, + "loss": 3.1727, + "mean_token_accuracy": 0.3847074508666992, + "num_tokens": 5415305086.0, + "step": 10593 + }, + { + "epoch": 2.864791779340184, + "grad_norm": 3.125, + "learning_rate": 0.009322240944074635, + "loss": 3.2589, + "mean_token_accuracy": 0.35870838165283203, + "num_tokens": 5415829365.0, + "step": 10594 + }, + { + "epoch": 2.8650621957815035, + "grad_norm": 2.984375, + "learning_rate": 0.0093206921155502, + "loss": 3.0504, + "mean_token_accuracy": 0.3966866433620453, + "num_tokens": 5416353613.0, + "step": 10595 + }, + { + "epoch": 2.865332612222823, + "grad_norm": 2.6875, + "learning_rate": 0.009319143338551867, + "loss": 2.987, + "mean_token_accuracy": 0.4277186095714569, + "num_tokens": 5416816990.0, + "step": 10596 + }, + { + "epoch": 2.8656030286641427, + "grad_norm": 2.8125, + "learning_rate": 0.009317594613127159, + "loss": 3.0383, + "mean_token_accuracy": 0.41769546270370483, + "num_tokens": 5417319409.0, + "step": 10597 + }, + { + "epoch": 2.8658734451054624, + "grad_norm": 3.3125, + "learning_rate": 0.009316045939323597, + "loss": 2.9285, + "mean_token_accuracy": 0.4238325357437134, + "num_tokens": 5417843542.0, + "step": 10598 + }, + { + "epoch": 2.866143861546782, + "grad_norm": 8.5, + "learning_rate": 0.009314497317188693, + "loss": 2.811, + "mean_token_accuracy": 0.45353975892066956, + "num_tokens": 5418367726.0, + "step": 10599 + }, + { + "epoch": 2.8664142779881017, + "grad_norm": 1.7109375, + "learning_rate": 0.009312948746769972, + "loss": 2.9447, + "mean_token_accuracy": 0.4191592037677765, + "num_tokens": 5418891760.0, + "step": 10600 + }, + { + "epoch": 2.8666846944294213, + "grad_norm": 2.796875, + "learning_rate": 0.009311400228114946, + "loss": 3.0133, + "mean_token_accuracy": 0.41457822918891907, + "num_tokens": 5419416008.0, + "step": 10601 + }, + { + "epoch": 2.866955110870741, + "grad_norm": 2.9375, + "learning_rate": 0.009309851761271122, + "loss": 2.8359, + "mean_token_accuracy": 0.419328898191452, + "num_tokens": 5419940109.0, + "step": 10602 + }, + { + "epoch": 2.8672255273120606, + "grad_norm": 3.125, + "learning_rate": 0.009308303346286023, + "loss": 2.8642, + "mean_token_accuracy": 0.453721284866333, + "num_tokens": 5420431059.0, + "step": 10603 + }, + { + "epoch": 2.8674959437533802, + "grad_norm": 2.140625, + "learning_rate": 0.009306754983207152, + "loss": 2.9683, + "mean_token_accuracy": 0.4205143451690674, + "num_tokens": 5420915799.0, + "step": 10604 + }, + { + "epoch": 2.8677663601947, + "grad_norm": 2.734375, + "learning_rate": 0.00930520667208201, + "loss": 3.0157, + "mean_token_accuracy": 0.40418434143066406, + "num_tokens": 5421440071.0, + "step": 10605 + }, + { + "epoch": 2.8680367766360195, + "grad_norm": 2.734375, + "learning_rate": 0.009303658412958118, + "loss": 2.9238, + "mean_token_accuracy": 0.40453964471817017, + "num_tokens": 5421964274.0, + "step": 10606 + }, + { + "epoch": 2.868307193077339, + "grad_norm": 2.375, + "learning_rate": 0.009302110205882973, + "loss": 2.9878, + "mean_token_accuracy": 0.4199265241622925, + "num_tokens": 5422432679.0, + "step": 10607 + }, + { + "epoch": 2.868577609518659, + "grad_norm": 2.609375, + "learning_rate": 0.009300562050904076, + "loss": 2.9428, + "mean_token_accuracy": 0.4372667074203491, + "num_tokens": 5422956553.0, + "step": 10608 + }, + { + "epoch": 2.8688480259599785, + "grad_norm": 3.0625, + "learning_rate": 0.009299013948068938, + "loss": 3.1078, + "mean_token_accuracy": 0.40243998169898987, + "num_tokens": 5423480825.0, + "step": 10609 + }, + { + "epoch": 2.869118442401298, + "grad_norm": 2.625, + "learning_rate": 0.009297465897425054, + "loss": 2.8521, + "mean_token_accuracy": 0.4226979613304138, + "num_tokens": 5424004996.0, + "step": 10610 + }, + { + "epoch": 2.8693888588426177, + "grad_norm": 276.0, + "learning_rate": 0.009295917899019918, + "loss": 22.19, + "mean_token_accuracy": 0.0, + "num_tokens": 5424529281.0, + "step": 10611 + }, + { + "epoch": 2.8696592752839374, + "grad_norm": 7.71875, + "learning_rate": 0.009294369952901041, + "loss": 3.309, + "mean_token_accuracy": 0.366092324256897, + "num_tokens": 5425053536.0, + "step": 10612 + }, + { + "epoch": 2.869929691725257, + "grad_norm": 2.40625, + "learning_rate": 0.009292822059115904, + "loss": 3.1069, + "mean_token_accuracy": 0.4113065004348755, + "num_tokens": 5425577795.0, + "step": 10613 + }, + { + "epoch": 2.8702001081665767, + "grad_norm": 3.515625, + "learning_rate": 0.009291274217712007, + "loss": 3.0264, + "mean_token_accuracy": 0.41303861141204834, + "num_tokens": 5426071558.0, + "step": 10614 + }, + { + "epoch": 2.8704705246078963, + "grad_norm": 2.78125, + "learning_rate": 0.009289726428736843, + "loss": 2.9014, + "mean_token_accuracy": 0.42656761407852173, + "num_tokens": 5426532366.0, + "step": 10615 + }, + { + "epoch": 2.870740941049216, + "grad_norm": 3.859375, + "learning_rate": 0.009288178692237904, + "loss": 2.8064, + "mean_token_accuracy": 0.407299280166626, + "num_tokens": 5427056547.0, + "step": 10616 + }, + { + "epoch": 2.871011357490535, + "grad_norm": 3.0, + "learning_rate": 0.00928663100826267, + "loss": 2.7083, + "mean_token_accuracy": 0.4468786120414734, + "num_tokens": 5427517375.0, + "step": 10617 + }, + { + "epoch": 2.8712817739318552, + "grad_norm": 2.96875, + "learning_rate": 0.009285083376858641, + "loss": 3.0679, + "mean_token_accuracy": 0.40219244360923767, + "num_tokens": 5428041648.0, + "step": 10618 + }, + { + "epoch": 2.8715521903731744, + "grad_norm": 2.453125, + "learning_rate": 0.009283535798073297, + "loss": 3.0587, + "mean_token_accuracy": 0.39179009199142456, + "num_tokens": 5428557873.0, + "step": 10619 + }, + { + "epoch": 2.8718226068144945, + "grad_norm": 3.234375, + "learning_rate": 0.00928198827195412, + "loss": 3.2146, + "mean_token_accuracy": 0.397585391998291, + "num_tokens": 5429074294.0, + "step": 10620 + }, + { + "epoch": 2.8720930232558137, + "grad_norm": 2.78125, + "learning_rate": 0.009280440798548594, + "loss": 2.7555, + "mean_token_accuracy": 0.43395155668258667, + "num_tokens": 5429549752.0, + "step": 10621 + }, + { + "epoch": 2.872363439697134, + "grad_norm": 3.078125, + "learning_rate": 0.009278893377904203, + "loss": 2.9382, + "mean_token_accuracy": 0.41949713230133057, + "num_tokens": 5430066657.0, + "step": 10622 + }, + { + "epoch": 2.872633856138453, + "grad_norm": 3.171875, + "learning_rate": 0.009277346010068426, + "loss": 3.0514, + "mean_token_accuracy": 0.3853660225868225, + "num_tokens": 5430590727.0, + "step": 10623 + }, + { + "epoch": 2.872904272579773, + "grad_norm": 5.125, + "learning_rate": 0.009275798695088742, + "loss": 2.7479, + "mean_token_accuracy": 0.45719319581985474, + "num_tokens": 5431114853.0, + "step": 10624 + }, + { + "epoch": 2.8731746890210923, + "grad_norm": 2.484375, + "learning_rate": 0.00927425143301262, + "loss": 3.0543, + "mean_token_accuracy": 0.43344372510910034, + "num_tokens": 5431523871.0, + "step": 10625 + }, + { + "epoch": 2.8734451054624124, + "grad_norm": 3.4375, + "learning_rate": 0.00927270422388754, + "loss": 3.1131, + "mean_token_accuracy": 0.39865386486053467, + "num_tokens": 5432048151.0, + "step": 10626 + }, + { + "epoch": 2.8737155219037316, + "grad_norm": 2.5625, + "learning_rate": 0.009271157067760976, + "loss": 3.0954, + "mean_token_accuracy": 0.4162495732307434, + "num_tokens": 5432572378.0, + "step": 10627 + }, + { + "epoch": 2.8739859383450512, + "grad_norm": 3.140625, + "learning_rate": 0.009269609964680394, + "loss": 2.8607, + "mean_token_accuracy": 0.44378232955932617, + "num_tokens": 5433096636.0, + "step": 10628 + }, + { + "epoch": 2.874256354786371, + "grad_norm": 2.46875, + "learning_rate": 0.00926806291469327, + "loss": 3.0866, + "mean_token_accuracy": 0.4114499092102051, + "num_tokens": 5433575188.0, + "step": 10629 + }, + { + "epoch": 2.8745267712276905, + "grad_norm": 3.828125, + "learning_rate": 0.009266515917847069, + "loss": 3.1436, + "mean_token_accuracy": 0.40638381242752075, + "num_tokens": 5434099444.0, + "step": 10630 + }, + { + "epoch": 2.87479718766901, + "grad_norm": 57.25, + "learning_rate": 0.009264968974189254, + "loss": 17.2935, + "mean_token_accuracy": 0.0, + "num_tokens": 5434623547.0, + "step": 10631 + }, + { + "epoch": 2.87506760411033, + "grad_norm": 4.65625, + "learning_rate": 0.009263422083767297, + "loss": 3.285, + "mean_token_accuracy": 0.38761258125305176, + "num_tokens": 5435084524.0, + "step": 10632 + }, + { + "epoch": 2.8753380205516494, + "grad_norm": 2.078125, + "learning_rate": 0.009261875246628658, + "loss": 2.8616, + "mean_token_accuracy": 0.41778457164764404, + "num_tokens": 5435608702.0, + "step": 10633 + }, + { + "epoch": 2.875608436992969, + "grad_norm": 6.59375, + "learning_rate": 0.009260328462820792, + "loss": 3.0901, + "mean_token_accuracy": 0.427509605884552, + "num_tokens": 5436132712.0, + "step": 10634 + }, + { + "epoch": 2.8758788534342887, + "grad_norm": 16.75, + "learning_rate": 0.009258781732391171, + "loss": 3.1008, + "mean_token_accuracy": 0.38179001212120056, + "num_tokens": 5436656924.0, + "step": 10635 + }, + { + "epoch": 2.8761492698756084, + "grad_norm": 3.03125, + "learning_rate": 0.009257235055387246, + "loss": 3.1916, + "mean_token_accuracy": 0.38869941234588623, + "num_tokens": 5437147009.0, + "step": 10636 + }, + { + "epoch": 2.876419686316928, + "grad_norm": 2.15625, + "learning_rate": 0.00925568843185647, + "loss": 2.9908, + "mean_token_accuracy": 0.42952263355255127, + "num_tokens": 5437646868.0, + "step": 10637 + }, + { + "epoch": 2.8766901027582477, + "grad_norm": 2.734375, + "learning_rate": 0.009254141861846308, + "loss": 2.7475, + "mean_token_accuracy": 0.469013512134552, + "num_tokens": 5438048849.0, + "step": 10638 + }, + { + "epoch": 2.8769605191995673, + "grad_norm": 2.625, + "learning_rate": 0.00925259534540421, + "loss": 3.1276, + "mean_token_accuracy": 0.4004841446876526, + "num_tokens": 5438539169.0, + "step": 10639 + }, + { + "epoch": 2.877230935640887, + "grad_norm": 3.46875, + "learning_rate": 0.009251048882577618, + "loss": 3.0809, + "mean_token_accuracy": 0.39978402853012085, + "num_tokens": 5439063415.0, + "step": 10640 + }, + { + "epoch": 2.8775013520822066, + "grad_norm": 3.09375, + "learning_rate": 0.009249502473413995, + "loss": 3.0674, + "mean_token_accuracy": 0.43806880712509155, + "num_tokens": 5439560747.0, + "step": 10641 + }, + { + "epoch": 2.8777717685235262, + "grad_norm": 3.046875, + "learning_rate": 0.00924795611796078, + "loss": 3.2937, + "mean_token_accuracy": 0.37472882866859436, + "num_tokens": 5440084774.0, + "step": 10642 + }, + { + "epoch": 2.878042184964846, + "grad_norm": 2.671875, + "learning_rate": 0.009246409816265426, + "loss": 3.0028, + "mean_token_accuracy": 0.41439002752304077, + "num_tokens": 5440609048.0, + "step": 10643 + }, + { + "epoch": 2.8783126014061655, + "grad_norm": 2.625, + "learning_rate": 0.009244863568375377, + "loss": 2.9188, + "mean_token_accuracy": 0.4136319160461426, + "num_tokens": 5441133212.0, + "step": 10644 + }, + { + "epoch": 2.878583017847485, + "grad_norm": 2.828125, + "learning_rate": 0.009243317374338072, + "loss": 3.0003, + "mean_token_accuracy": 0.42467331886291504, + "num_tokens": 5441657384.0, + "step": 10645 + }, + { + "epoch": 2.878853434288805, + "grad_norm": 4.53125, + "learning_rate": 0.00924177123420096, + "loss": 3.0559, + "mean_token_accuracy": 0.43728184700012207, + "num_tokens": 5442117263.0, + "step": 10646 + }, + { + "epoch": 2.8791238507301244, + "grad_norm": 3.1875, + "learning_rate": 0.009240225148011476, + "loss": 3.0663, + "mean_token_accuracy": 0.40262287855148315, + "num_tokens": 5442641465.0, + "step": 10647 + }, + { + "epoch": 2.879394267171444, + "grad_norm": 3.5, + "learning_rate": 0.009238679115817055, + "loss": 3.1003, + "mean_token_accuracy": 0.4011092185974121, + "num_tokens": 5443165716.0, + "step": 10648 + }, + { + "epoch": 2.8796646836127637, + "grad_norm": 2.703125, + "learning_rate": 0.009237133137665146, + "loss": 3.0175, + "mean_token_accuracy": 0.42860081791877747, + "num_tokens": 5443689895.0, + "step": 10649 + }, + { + "epoch": 2.8799351000540834, + "grad_norm": 2.453125, + "learning_rate": 0.009235587213603172, + "loss": 2.8015, + "mean_token_accuracy": 0.4232461750507355, + "num_tokens": 5444213789.0, + "step": 10650 + }, + { + "epoch": 2.880205516495403, + "grad_norm": 144.0, + "learning_rate": 0.00923404134367857, + "loss": 15.5083, + "mean_token_accuracy": 0.0, + "num_tokens": 5444737958.0, + "step": 10651 + }, + { + "epoch": 2.8804759329367227, + "grad_norm": 6.3125, + "learning_rate": 0.009232495527938776, + "loss": 3.6666, + "mean_token_accuracy": 0.3551342785358429, + "num_tokens": 5445262166.0, + "step": 10652 + }, + { + "epoch": 2.8807463493780423, + "grad_norm": 2.15625, + "learning_rate": 0.009230949766431218, + "loss": 3.0259, + "mean_token_accuracy": 0.4206373691558838, + "num_tokens": 5445786374.0, + "step": 10653 + }, + { + "epoch": 2.881016765819362, + "grad_norm": 2.75, + "learning_rate": 0.00922940405920332, + "loss": 2.9421, + "mean_token_accuracy": 0.4213186204433441, + "num_tokens": 5446298397.0, + "step": 10654 + }, + { + "epoch": 2.8812871822606816, + "grad_norm": 2.484375, + "learning_rate": 0.009227858406302514, + "loss": 3.1322, + "mean_token_accuracy": 0.4028642475605011, + "num_tokens": 5446822676.0, + "step": 10655 + }, + { + "epoch": 2.8815575987020012, + "grad_norm": 2.609375, + "learning_rate": 0.009226312807776223, + "loss": 2.9024, + "mean_token_accuracy": 0.43509507179260254, + "num_tokens": 5447346915.0, + "step": 10656 + }, + { + "epoch": 2.881828015143321, + "grad_norm": 2.40625, + "learning_rate": 0.009224767263671869, + "loss": 2.952, + "mean_token_accuracy": 0.40303492546081543, + "num_tokens": 5447871177.0, + "step": 10657 + }, + { + "epoch": 2.88209843158464, + "grad_norm": 2.8125, + "learning_rate": 0.00922322177403688, + "loss": 3.0596, + "mean_token_accuracy": 0.40064704418182373, + "num_tokens": 5448395296.0, + "step": 10658 + }, + { + "epoch": 2.88236884802596, + "grad_norm": 3.375, + "learning_rate": 0.009221676338918674, + "loss": 3.0978, + "mean_token_accuracy": 0.39765000343322754, + "num_tokens": 5448919430.0, + "step": 10659 + }, + { + "epoch": 2.8826392644672794, + "grad_norm": 2.34375, + "learning_rate": 0.00922013095836466, + "loss": 2.9034, + "mean_token_accuracy": 0.421982079744339, + "num_tokens": 5449443700.0, + "step": 10660 + }, + { + "epoch": 2.8829096809085994, + "grad_norm": 2.875, + "learning_rate": 0.009218585632422269, + "loss": 3.0114, + "mean_token_accuracy": 0.4010227918624878, + "num_tokens": 5449936143.0, + "step": 10661 + }, + { + "epoch": 2.8831800973499186, + "grad_norm": 2.84375, + "learning_rate": 0.009217040361138908, + "loss": 2.8384, + "mean_token_accuracy": 0.4130236506462097, + "num_tokens": 5450460351.0, + "step": 10662 + }, + { + "epoch": 2.8834505137912387, + "grad_norm": 2.625, + "learning_rate": 0.009215495144561991, + "loss": 3.0821, + "mean_token_accuracy": 0.4187161922454834, + "num_tokens": 5450946137.0, + "step": 10663 + }, + { + "epoch": 2.883720930232558, + "grad_norm": 3.25, + "learning_rate": 0.009213949982738931, + "loss": 3.21, + "mean_token_accuracy": 0.4074987769126892, + "num_tokens": 5451470396.0, + "step": 10664 + }, + { + "epoch": 2.883991346673878, + "grad_norm": 3.046875, + "learning_rate": 0.009212404875717142, + "loss": 2.86, + "mean_token_accuracy": 0.4151257872581482, + "num_tokens": 5451994549.0, + "step": 10665 + }, + { + "epoch": 2.884261763115197, + "grad_norm": 2.515625, + "learning_rate": 0.009210859823544023, + "loss": 3.0438, + "mean_token_accuracy": 0.4381009340286255, + "num_tokens": 5452420163.0, + "step": 10666 + }, + { + "epoch": 2.8845321795565173, + "grad_norm": 2.546875, + "learning_rate": 0.009209314826266989, + "loss": 3.0467, + "mean_token_accuracy": 0.4151025712490082, + "num_tokens": 5452908641.0, + "step": 10667 + }, + { + "epoch": 2.8848025959978365, + "grad_norm": 3.03125, + "learning_rate": 0.009207769883933436, + "loss": 3.1044, + "mean_token_accuracy": 0.4035390615463257, + "num_tokens": 5453432894.0, + "step": 10668 + }, + { + "epoch": 2.885073012439156, + "grad_norm": 2.859375, + "learning_rate": 0.009206224996590781, + "loss": 3.0221, + "mean_token_accuracy": 0.4228992462158203, + "num_tokens": 5453902880.0, + "step": 10669 + }, + { + "epoch": 2.885343428880476, + "grad_norm": 3.015625, + "learning_rate": 0.009204680164286416, + "loss": 3.0894, + "mean_token_accuracy": 0.39147645235061646, + "num_tokens": 5454426934.0, + "step": 10670 + }, + { + "epoch": 2.8856138453217954, + "grad_norm": 86.0, + "learning_rate": 0.009203135387067742, + "loss": 13.5727, + "mean_token_accuracy": 0.006796166300773621, + "num_tokens": 5454914515.0, + "step": 10671 + }, + { + "epoch": 2.885884261763115, + "grad_norm": 7.125, + "learning_rate": 0.00920159066498216, + "loss": 3.4286, + "mean_token_accuracy": 0.33838120102882385, + "num_tokens": 5455438526.0, + "step": 10672 + }, + { + "epoch": 2.8861546782044347, + "grad_norm": 22.625, + "learning_rate": 0.009200045998077068, + "loss": 3.4251, + "mean_token_accuracy": 0.3421333432197571, + "num_tokens": 5455962727.0, + "step": 10673 + }, + { + "epoch": 2.8864250946457544, + "grad_norm": 3.359375, + "learning_rate": 0.009198501386399851, + "loss": 3.0428, + "mean_token_accuracy": 0.38906043767929077, + "num_tokens": 5456478281.0, + "step": 10674 + }, + { + "epoch": 2.886695511087074, + "grad_norm": 2.515625, + "learning_rate": 0.009196956829997915, + "loss": 2.867, + "mean_token_accuracy": 0.4058293104171753, + "num_tokens": 5457002456.0, + "step": 10675 + }, + { + "epoch": 2.8869659275283936, + "grad_norm": 3.125, + "learning_rate": 0.009195412328918646, + "loss": 3.0074, + "mean_token_accuracy": 0.4079359173774719, + "num_tokens": 5457526518.0, + "step": 10676 + }, + { + "epoch": 2.8872363439697133, + "grad_norm": 3.265625, + "learning_rate": 0.00919386788320943, + "loss": 3.122, + "mean_token_accuracy": 0.41516947746276855, + "num_tokens": 5457993367.0, + "step": 10677 + }, + { + "epoch": 2.887506760411033, + "grad_norm": 3.0, + "learning_rate": 0.00919232349291766, + "loss": 3.0812, + "mean_token_accuracy": 0.4134846329689026, + "num_tokens": 5458517640.0, + "step": 10678 + }, + { + "epoch": 2.8877771768523526, + "grad_norm": 2.796875, + "learning_rate": 0.009190779158090726, + "loss": 3.1003, + "mean_token_accuracy": 0.41037678718566895, + "num_tokens": 5459041721.0, + "step": 10679 + }, + { + "epoch": 2.888047593293672, + "grad_norm": 2.6875, + "learning_rate": 0.009189234878776001, + "loss": 3.1283, + "mean_token_accuracy": 0.3985869586467743, + "num_tokens": 5459565917.0, + "step": 10680 + }, + { + "epoch": 2.888318009734992, + "grad_norm": 3.0625, + "learning_rate": 0.00918769065502088, + "loss": 3.234, + "mean_token_accuracy": 0.3782995939254761, + "num_tokens": 5460090192.0, + "step": 10681 + }, + { + "epoch": 2.8885884261763115, + "grad_norm": 2.515625, + "learning_rate": 0.009186146486872738, + "loss": 3.0635, + "mean_token_accuracy": 0.4053632915019989, + "num_tokens": 5460614379.0, + "step": 10682 + }, + { + "epoch": 2.888858842617631, + "grad_norm": 3.703125, + "learning_rate": 0.009184602374378955, + "loss": 2.9467, + "mean_token_accuracy": 0.4091033935546875, + "num_tokens": 5461068999.0, + "step": 10683 + }, + { + "epoch": 2.889129259058951, + "grad_norm": 3.234375, + "learning_rate": 0.009183058317586909, + "loss": 2.9708, + "mean_token_accuracy": 0.4204431176185608, + "num_tokens": 5461593108.0, + "step": 10684 + }, + { + "epoch": 2.8893996755002704, + "grad_norm": 3.421875, + "learning_rate": 0.00918151431654398, + "loss": 2.9889, + "mean_token_accuracy": 0.42451733350753784, + "num_tokens": 5462104178.0, + "step": 10685 + }, + { + "epoch": 2.88967009194159, + "grad_norm": 2.8125, + "learning_rate": 0.009179970371297535, + "loss": 3.0715, + "mean_token_accuracy": 0.41407620906829834, + "num_tokens": 5462605288.0, + "step": 10686 + }, + { + "epoch": 2.8899405083829097, + "grad_norm": 3.015625, + "learning_rate": 0.009178426481894954, + "loss": 3.1976, + "mean_token_accuracy": 0.3796426057815552, + "num_tokens": 5463129387.0, + "step": 10687 + }, + { + "epoch": 2.8902109248242294, + "grad_norm": 3.015625, + "learning_rate": 0.009176882648383608, + "loss": 3.1411, + "mean_token_accuracy": 0.41487282514572144, + "num_tokens": 5463653653.0, + "step": 10688 + }, + { + "epoch": 2.890481341265549, + "grad_norm": 3.328125, + "learning_rate": 0.00917533887081086, + "loss": 3.0821, + "mean_token_accuracy": 0.4135168194770813, + "num_tokens": 5464177838.0, + "step": 10689 + }, + { + "epoch": 2.8907517577068687, + "grad_norm": 2.421875, + "learning_rate": 0.009173795149224085, + "loss": 2.9699, + "mean_token_accuracy": 0.4162871837615967, + "num_tokens": 5464702011.0, + "step": 10690 + }, + { + "epoch": 2.8910221741481883, + "grad_norm": 50.25, + "learning_rate": 0.009172251483670641, + "loss": 11.0171, + "mean_token_accuracy": 0.01670161262154579, + "num_tokens": 5465226144.0, + "step": 10691 + }, + { + "epoch": 2.891292590589508, + "grad_norm": 6.59375, + "learning_rate": 0.009170707874197898, + "loss": 3.3107, + "mean_token_accuracy": 0.3554840087890625, + "num_tokens": 5465695818.0, + "step": 10692 + }, + { + "epoch": 2.8915630070308276, + "grad_norm": 1.875, + "learning_rate": 0.009169164320853221, + "loss": 2.9598, + "mean_token_accuracy": 0.4199756383895874, + "num_tokens": 5466220069.0, + "step": 10693 + }, + { + "epoch": 2.8918334234721472, + "grad_norm": 2.609375, + "learning_rate": 0.00916762082368396, + "loss": 2.7736, + "mean_token_accuracy": 0.42209333181381226, + "num_tokens": 5466706351.0, + "step": 10694 + }, + { + "epoch": 2.892103839913467, + "grad_norm": 3.390625, + "learning_rate": 0.009166077382737486, + "loss": 2.902, + "mean_token_accuracy": 0.4256431460380554, + "num_tokens": 5467230492.0, + "step": 10695 + }, + { + "epoch": 2.8923742563547865, + "grad_norm": 3.03125, + "learning_rate": 0.00916453399806115, + "loss": 2.9703, + "mean_token_accuracy": 0.41360002756118774, + "num_tokens": 5467698643.0, + "step": 10696 + }, + { + "epoch": 2.892644672796106, + "grad_norm": 2.578125, + "learning_rate": 0.009162990669702307, + "loss": 2.8502, + "mean_token_accuracy": 0.4320108890533447, + "num_tokens": 5468186822.0, + "step": 10697 + }, + { + "epoch": 2.892915089237426, + "grad_norm": 2.875, + "learning_rate": 0.009161447397708316, + "loss": 3.2005, + "mean_token_accuracy": 0.40657520294189453, + "num_tokens": 5468666494.0, + "step": 10698 + }, + { + "epoch": 2.893185505678745, + "grad_norm": 2.921875, + "learning_rate": 0.009159904182126524, + "loss": 2.9977, + "mean_token_accuracy": 0.41027864813804626, + "num_tokens": 5469190735.0, + "step": 10699 + }, + { + "epoch": 2.893455922120065, + "grad_norm": 2.734375, + "learning_rate": 0.009158361023004281, + "loss": 3.0123, + "mean_token_accuracy": 0.4639609158039093, + "num_tokens": 5469650957.0, + "step": 10700 + }, + { + "epoch": 2.8937263385613843, + "grad_norm": 3.171875, + "learning_rate": 0.009156817920388938, + "loss": 2.8532, + "mean_token_accuracy": 0.4258039593696594, + "num_tokens": 5470175114.0, + "step": 10701 + }, + { + "epoch": 2.8939967550027044, + "grad_norm": 3.15625, + "learning_rate": 0.009155274874327846, + "loss": 3.1275, + "mean_token_accuracy": 0.4064052700996399, + "num_tokens": 5470699303.0, + "step": 10702 + }, + { + "epoch": 2.8942671714440236, + "grad_norm": 3.015625, + "learning_rate": 0.00915373188486834, + "loss": 3.1366, + "mean_token_accuracy": 0.40984654426574707, + "num_tokens": 5471223483.0, + "step": 10703 + }, + { + "epoch": 2.8945375878853437, + "grad_norm": 2.828125, + "learning_rate": 0.009152188952057773, + "loss": 3.2177, + "mean_token_accuracy": 0.3987022638320923, + "num_tokens": 5471747671.0, + "step": 10704 + }, + { + "epoch": 2.894808004326663, + "grad_norm": 2.578125, + "learning_rate": 0.00915064607594348, + "loss": 2.9136, + "mean_token_accuracy": 0.4174044132232666, + "num_tokens": 5472271774.0, + "step": 10705 + }, + { + "epoch": 2.895078420767983, + "grad_norm": 3.484375, + "learning_rate": 0.009149103256572809, + "loss": 3.0162, + "mean_token_accuracy": 0.4076458811759949, + "num_tokens": 5472796054.0, + "step": 10706 + }, + { + "epoch": 2.895348837209302, + "grad_norm": 3.3125, + "learning_rate": 0.009147560493993089, + "loss": 3.0456, + "mean_token_accuracy": 0.417484849691391, + "num_tokens": 5473197902.0, + "step": 10707 + }, + { + "epoch": 2.8956192536506222, + "grad_norm": 3.0, + "learning_rate": 0.009146017788251661, + "loss": 2.8743, + "mean_token_accuracy": 0.42764660716056824, + "num_tokens": 5473722013.0, + "step": 10708 + }, + { + "epoch": 2.8958896700919414, + "grad_norm": 7.375, + "learning_rate": 0.00914447513939586, + "loss": 2.8652, + "mean_token_accuracy": 0.4569885730743408, + "num_tokens": 5474246245.0, + "step": 10709 + }, + { + "epoch": 2.896160086533261, + "grad_norm": 2.109375, + "learning_rate": 0.009142932547473021, + "loss": 2.9034, + "mean_token_accuracy": 0.4052219092845917, + "num_tokens": 5474770481.0, + "step": 10710 + }, + { + "epoch": 2.8964305029745807, + "grad_norm": 16.875, + "learning_rate": 0.00914139001253047, + "loss": 10.5551, + "mean_token_accuracy": 9.422559378435835e-05, + "num_tokens": 5475294740.0, + "step": 10711 + }, + { + "epoch": 2.8967009194159004, + "grad_norm": 5.71875, + "learning_rate": 0.009139847534615539, + "loss": 3.3542, + "mean_token_accuracy": 0.3853577971458435, + "num_tokens": 5475819018.0, + "step": 10712 + }, + { + "epoch": 2.89697133585722, + "grad_norm": 2.59375, + "learning_rate": 0.009138305113775558, + "loss": 2.9784, + "mean_token_accuracy": 0.4206414818763733, + "num_tokens": 5476343206.0, + "step": 10713 + }, + { + "epoch": 2.8972417522985396, + "grad_norm": 3.09375, + "learning_rate": 0.009136762750057847, + "loss": 3.0833, + "mean_token_accuracy": 0.4005689322948456, + "num_tokens": 5476867445.0, + "step": 10714 + }, + { + "epoch": 2.8975121687398593, + "grad_norm": 3.28125, + "learning_rate": 0.009135220443509737, + "loss": 3.0686, + "mean_token_accuracy": 0.41312360763549805, + "num_tokens": 5477391714.0, + "step": 10715 + }, + { + "epoch": 2.897782585181179, + "grad_norm": 2.90625, + "learning_rate": 0.00913367819417855, + "loss": 3.0616, + "mean_token_accuracy": 0.4184231758117676, + "num_tokens": 5477903251.0, + "step": 10716 + }, + { + "epoch": 2.8980530016224986, + "grad_norm": 2.71875, + "learning_rate": 0.0091321360021116, + "loss": 3.0164, + "mean_token_accuracy": 0.41235125064849854, + "num_tokens": 5478427506.0, + "step": 10717 + }, + { + "epoch": 2.898323418063818, + "grad_norm": 3.296875, + "learning_rate": 0.009130593867356216, + "loss": 3.123, + "mean_token_accuracy": 0.39519640803337097, + "num_tokens": 5478951783.0, + "step": 10718 + }, + { + "epoch": 2.898593834505138, + "grad_norm": 2.625, + "learning_rate": 0.009129051789959708, + "loss": 3.2099, + "mean_token_accuracy": 0.415309876203537, + "num_tokens": 5479433487.0, + "step": 10719 + }, + { + "epoch": 2.8988642509464575, + "grad_norm": 2.96875, + "learning_rate": 0.00912750976996939, + "loss": 3.093, + "mean_token_accuracy": 0.4258548617362976, + "num_tokens": 5479896144.0, + "step": 10720 + }, + { + "epoch": 2.899134667387777, + "grad_norm": 2.5625, + "learning_rate": 0.009125967807432578, + "loss": 3.1687, + "mean_token_accuracy": 0.4010965824127197, + "num_tokens": 5480420416.0, + "step": 10721 + }, + { + "epoch": 2.899405083829097, + "grad_norm": 3.40625, + "learning_rate": 0.00912442590239659, + "loss": 2.9412, + "mean_token_accuracy": 0.43900829553604126, + "num_tokens": 5480936092.0, + "step": 10722 + }, + { + "epoch": 2.8996755002704164, + "grad_norm": 2.578125, + "learning_rate": 0.009122884054908726, + "loss": 2.8868, + "mean_token_accuracy": 0.40520596504211426, + "num_tokens": 5481460299.0, + "step": 10723 + }, + { + "epoch": 2.899945916711736, + "grad_norm": 4.09375, + "learning_rate": 0.009121342265016303, + "loss": 3.1529, + "mean_token_accuracy": 0.36982929706573486, + "num_tokens": 5481984531.0, + "step": 10724 + }, + { + "epoch": 2.9002163331530557, + "grad_norm": 2.59375, + "learning_rate": 0.009119800532766625, + "loss": 3.055, + "mean_token_accuracy": 0.41588157415390015, + "num_tokens": 5482460345.0, + "step": 10725 + }, + { + "epoch": 2.9004867495943754, + "grad_norm": 3.90625, + "learning_rate": 0.009118258858206992, + "loss": 3.3513, + "mean_token_accuracy": 0.36364299058914185, + "num_tokens": 5482984516.0, + "step": 10726 + }, + { + "epoch": 2.900757166035695, + "grad_norm": 2.59375, + "learning_rate": 0.009116717241384712, + "loss": 3.1339, + "mean_token_accuracy": 0.4135070741176605, + "num_tokens": 5483457792.0, + "step": 10727 + }, + { + "epoch": 2.9010275824770146, + "grad_norm": 3.140625, + "learning_rate": 0.009115175682347088, + "loss": 3.0591, + "mean_token_accuracy": 0.3778782784938812, + "num_tokens": 5483981999.0, + "step": 10728 + }, + { + "epoch": 2.9012979989183343, + "grad_norm": 2.75, + "learning_rate": 0.009113634181141415, + "loss": 3.0485, + "mean_token_accuracy": 0.42307305335998535, + "num_tokens": 5484449248.0, + "step": 10729 + }, + { + "epoch": 2.901568415359654, + "grad_norm": 3.4375, + "learning_rate": 0.009112092737814994, + "loss": 3.0743, + "mean_token_accuracy": 0.41203948855400085, + "num_tokens": 5484973502.0, + "step": 10730 + }, + { + "epoch": 2.9018388318009736, + "grad_norm": 25.0, + "learning_rate": 0.00911055135241512, + "loss": 11.6104, + "mean_token_accuracy": 0.04066265746951103, + "num_tokens": 5485497615.0, + "step": 10731 + }, + { + "epoch": 2.902109248242293, + "grad_norm": 6.59375, + "learning_rate": 0.009109010024989084, + "loss": 3.2898, + "mean_token_accuracy": 0.35930898785591125, + "num_tokens": 5486021864.0, + "step": 10732 + }, + { + "epoch": 2.902379664683613, + "grad_norm": 2.515625, + "learning_rate": 0.009107468755584183, + "loss": 3.143, + "mean_token_accuracy": 0.4067033529281616, + "num_tokens": 5486517147.0, + "step": 10733 + }, + { + "epoch": 2.9026500811249325, + "grad_norm": 3.03125, + "learning_rate": 0.009105927544247707, + "loss": 2.9952, + "mean_token_accuracy": 0.4230559468269348, + "num_tokens": 5486984511.0, + "step": 10734 + }, + { + "epoch": 2.902920497566252, + "grad_norm": 2.671875, + "learning_rate": 0.009104386391026942, + "loss": 3.028, + "mean_token_accuracy": 0.38941311836242676, + "num_tokens": 5487508683.0, + "step": 10735 + }, + { + "epoch": 2.903190914007572, + "grad_norm": 39.5, + "learning_rate": 0.00910284529596918, + "loss": 3.2861, + "mean_token_accuracy": 0.38318729400634766, + "num_tokens": 5488032829.0, + "step": 10736 + }, + { + "epoch": 2.9034613304488914, + "grad_norm": 4.5, + "learning_rate": 0.0091013042591217, + "loss": 3.1401, + "mean_token_accuracy": 0.3941464126110077, + "num_tokens": 5488557114.0, + "step": 10737 + }, + { + "epoch": 2.903731746890211, + "grad_norm": 2.265625, + "learning_rate": 0.009099763280531793, + "loss": 3.211, + "mean_token_accuracy": 0.39662671089172363, + "num_tokens": 5489045993.0, + "step": 10738 + }, + { + "epoch": 2.9040021633315307, + "grad_norm": 2.359375, + "learning_rate": 0.009098222360246737, + "loss": 2.9211, + "mean_token_accuracy": 0.41264355182647705, + "num_tokens": 5489570260.0, + "step": 10739 + }, + { + "epoch": 2.90427257977285, + "grad_norm": 3.25, + "learning_rate": 0.009096681498313807, + "loss": 3.2574, + "mean_token_accuracy": 0.39676493406295776, + "num_tokens": 5490094531.0, + "step": 10740 + }, + { + "epoch": 2.90454299621417, + "grad_norm": 3.828125, + "learning_rate": 0.00909514069478029, + "loss": 3.0844, + "mean_token_accuracy": 0.41206175088882446, + "num_tokens": 5490618799.0, + "step": 10741 + }, + { + "epoch": 2.904813412655489, + "grad_norm": 2.546875, + "learning_rate": 0.009093599949693457, + "loss": 2.9369, + "mean_token_accuracy": 0.4319078028202057, + "num_tokens": 5491083680.0, + "step": 10742 + }, + { + "epoch": 2.9050838290968093, + "grad_norm": 2.46875, + "learning_rate": 0.009092059263100583, + "loss": 3.1018, + "mean_token_accuracy": 0.4064585268497467, + "num_tokens": 5491553939.0, + "step": 10743 + }, + { + "epoch": 2.9053542455381285, + "grad_norm": 2.96875, + "learning_rate": 0.009090518635048943, + "loss": 2.9822, + "mean_token_accuracy": 0.4099118709564209, + "num_tokens": 5492045022.0, + "step": 10744 + }, + { + "epoch": 2.9056246619794486, + "grad_norm": 2.671875, + "learning_rate": 0.009088978065585809, + "loss": 2.975, + "mean_token_accuracy": 0.4200330376625061, + "num_tokens": 5492569290.0, + "step": 10745 + }, + { + "epoch": 2.9058950784207678, + "grad_norm": 2.828125, + "learning_rate": 0.009087437554758443, + "loss": 2.9193, + "mean_token_accuracy": 0.4229693114757538, + "num_tokens": 5493058722.0, + "step": 10746 + }, + { + "epoch": 2.906165494862088, + "grad_norm": 2.359375, + "learning_rate": 0.00908589710261412, + "loss": 2.906, + "mean_token_accuracy": 0.4278344213962555, + "num_tokens": 5493582935.0, + "step": 10747 + }, + { + "epoch": 2.906435911303407, + "grad_norm": 3.421875, + "learning_rate": 0.009084356709200101, + "loss": 3.2054, + "mean_token_accuracy": 0.38985365629196167, + "num_tokens": 5494107158.0, + "step": 10748 + }, + { + "epoch": 2.906706327744727, + "grad_norm": 5.59375, + "learning_rate": 0.009082816374563653, + "loss": 2.7578, + "mean_token_accuracy": 0.4605865776538849, + "num_tokens": 5494631291.0, + "step": 10749 + }, + { + "epoch": 2.9069767441860463, + "grad_norm": 2.09375, + "learning_rate": 0.009081276098752039, + "loss": 3.0729, + "mean_token_accuracy": 0.40897083282470703, + "num_tokens": 5495155505.0, + "step": 10750 + }, + { + "epoch": 2.907247160627366, + "grad_norm": 18.0, + "learning_rate": 0.009079735881812516, + "loss": 11.9581, + "mean_token_accuracy": 0.0, + "num_tokens": 5495679649.0, + "step": 10751 + }, + { + "epoch": 2.9075175770686856, + "grad_norm": 8.0, + "learning_rate": 0.009078195723792338, + "loss": 3.3966, + "mean_token_accuracy": 0.3642253279685974, + "num_tokens": 5496203926.0, + "step": 10752 + }, + { + "epoch": 2.9077879935100053, + "grad_norm": 2.1875, + "learning_rate": 0.009076655624738772, + "loss": 3.0611, + "mean_token_accuracy": 0.4209912121295929, + "num_tokens": 5496684912.0, + "step": 10753 + }, + { + "epoch": 2.908058409951325, + "grad_norm": 2.078125, + "learning_rate": 0.00907511558469907, + "loss": 3.0636, + "mean_token_accuracy": 0.4061492085456848, + "num_tokens": 5497209108.0, + "step": 10754 + }, + { + "epoch": 2.9083288263926446, + "grad_norm": 2.171875, + "learning_rate": 0.009073575603720477, + "loss": 3.0049, + "mean_token_accuracy": 0.384439617395401, + "num_tokens": 5497733300.0, + "step": 10755 + }, + { + "epoch": 2.908599242833964, + "grad_norm": 2.296875, + "learning_rate": 0.00907203568185025, + "loss": 2.8752, + "mean_token_accuracy": 0.42457085847854614, + "num_tokens": 5498228911.0, + "step": 10756 + }, + { + "epoch": 2.908869659275284, + "grad_norm": 2.53125, + "learning_rate": 0.009070495819135643, + "loss": 3.0865, + "mean_token_accuracy": 0.4021252393722534, + "num_tokens": 5498753105.0, + "step": 10757 + }, + { + "epoch": 2.9091400757166035, + "grad_norm": 3.8125, + "learning_rate": 0.009068956015623892, + "loss": 3.204, + "mean_token_accuracy": 0.3939080238342285, + "num_tokens": 5499277383.0, + "step": 10758 + }, + { + "epoch": 2.909410492157923, + "grad_norm": 2.828125, + "learning_rate": 0.009067416271362254, + "loss": 2.9787, + "mean_token_accuracy": 0.4099493622779846, + "num_tokens": 5499801543.0, + "step": 10759 + }, + { + "epoch": 2.9096809085992428, + "grad_norm": 3.78125, + "learning_rate": 0.009065876586397965, + "loss": 3.1014, + "mean_token_accuracy": 0.4083939790725708, + "num_tokens": 5500321401.0, + "step": 10760 + }, + { + "epoch": 2.9099513250405624, + "grad_norm": 3.421875, + "learning_rate": 0.009064336960778272, + "loss": 3.2645, + "mean_token_accuracy": 0.4144597053527832, + "num_tokens": 5500770350.0, + "step": 10761 + }, + { + "epoch": 2.910221741481882, + "grad_norm": 3.0, + "learning_rate": 0.009062797394550416, + "loss": 3.04, + "mean_token_accuracy": 0.4000072479248047, + "num_tokens": 5501294624.0, + "step": 10762 + }, + { + "epoch": 2.9104921579232017, + "grad_norm": 3.03125, + "learning_rate": 0.00906125788776163, + "loss": 2.9801, + "mean_token_accuracy": 0.415412038564682, + "num_tokens": 5501818881.0, + "step": 10763 + }, + { + "epoch": 2.9107625743645213, + "grad_norm": 3.203125, + "learning_rate": 0.009059718440459153, + "loss": 3.064, + "mean_token_accuracy": 0.4094266891479492, + "num_tokens": 5502343138.0, + "step": 10764 + }, + { + "epoch": 2.911032990805841, + "grad_norm": 2.984375, + "learning_rate": 0.009058179052690223, + "loss": 3.1493, + "mean_token_accuracy": 0.41677868366241455, + "num_tokens": 5502808242.0, + "step": 10765 + }, + { + "epoch": 2.9113034072471606, + "grad_norm": 2.921875, + "learning_rate": 0.009056639724502064, + "loss": 2.7815, + "mean_token_accuracy": 0.42409011721611023, + "num_tokens": 5503332409.0, + "step": 10766 + }, + { + "epoch": 2.9115738236884803, + "grad_norm": 3.25, + "learning_rate": 0.00905510045594192, + "loss": 3.0489, + "mean_token_accuracy": 0.39926087856292725, + "num_tokens": 5503856596.0, + "step": 10767 + }, + { + "epoch": 2.9118442401298, + "grad_norm": 3.578125, + "learning_rate": 0.009053561247057014, + "loss": 2.9865, + "mean_token_accuracy": 0.3738585114479065, + "num_tokens": 5504380859.0, + "step": 10768 + }, + { + "epoch": 2.9121146565711196, + "grad_norm": 3.015625, + "learning_rate": 0.009052022097894567, + "loss": 2.7334, + "mean_token_accuracy": 0.43401074409484863, + "num_tokens": 5504905042.0, + "step": 10769 + }, + { + "epoch": 2.912385073012439, + "grad_norm": 3.0625, + "learning_rate": 0.009050483008501815, + "loss": 2.9848, + "mean_token_accuracy": 0.4028341472148895, + "num_tokens": 5505429319.0, + "step": 10770 + }, + { + "epoch": 2.912655489453759, + "grad_norm": 60.0, + "learning_rate": 0.009048943978925977, + "loss": 13.6664, + "mean_token_accuracy": 0.009564220905303955, + "num_tokens": 5505953518.0, + "step": 10771 + }, + { + "epoch": 2.9129259058950785, + "grad_norm": 6.3125, + "learning_rate": 0.009047405009214271, + "loss": 3.4207, + "mean_token_accuracy": 0.38489067554473877, + "num_tokens": 5506477698.0, + "step": 10772 + }, + { + "epoch": 2.913196322336398, + "grad_norm": 2.5625, + "learning_rate": 0.009045866099413928, + "loss": 3.1592, + "mean_token_accuracy": 0.4049871861934662, + "num_tokens": 5506974499.0, + "step": 10773 + }, + { + "epoch": 2.913466738777718, + "grad_norm": 3.203125, + "learning_rate": 0.00904432724957216, + "loss": 3.0108, + "mean_token_accuracy": 0.39663559198379517, + "num_tokens": 5507498781.0, + "step": 10774 + }, + { + "epoch": 2.9137371552190374, + "grad_norm": 3.28125, + "learning_rate": 0.009042788459736177, + "loss": 3.3604, + "mean_token_accuracy": 0.38458287715911865, + "num_tokens": 5508022976.0, + "step": 10775 + }, + { + "epoch": 2.914007571660357, + "grad_norm": 2.109375, + "learning_rate": 0.009041249729953205, + "loss": 2.7515, + "mean_token_accuracy": 0.4217371642589569, + "num_tokens": 5508540710.0, + "step": 10776 + }, + { + "epoch": 2.9142779881016767, + "grad_norm": 2.390625, + "learning_rate": 0.00903971106027045, + "loss": 2.9306, + "mean_token_accuracy": 0.40785807371139526, + "num_tokens": 5509064964.0, + "step": 10777 + }, + { + "epoch": 2.9145484045429964, + "grad_norm": 2.953125, + "learning_rate": 0.00903817245073512, + "loss": 3.1712, + "mean_token_accuracy": 0.3839753568172455, + "num_tokens": 5509589169.0, + "step": 10778 + }, + { + "epoch": 2.914818820984316, + "grad_norm": 2.828125, + "learning_rate": 0.009036633901394435, + "loss": 2.9974, + "mean_token_accuracy": 0.42147767543792725, + "num_tokens": 5510113415.0, + "step": 10779 + }, + { + "epoch": 2.9150892374256356, + "grad_norm": 2.6875, + "learning_rate": 0.009035095412295595, + "loss": 3.0376, + "mean_token_accuracy": 0.3999604284763336, + "num_tokens": 5510617319.0, + "step": 10780 + }, + { + "epoch": 2.915359653866955, + "grad_norm": 2.71875, + "learning_rate": 0.009033556983485803, + "loss": 2.9307, + "mean_token_accuracy": 0.40943628549575806, + "num_tokens": 5511141576.0, + "step": 10781 + }, + { + "epoch": 2.915630070308275, + "grad_norm": 2.40625, + "learning_rate": 0.00903201861501227, + "loss": 3.1137, + "mean_token_accuracy": 0.40892893075942993, + "num_tokens": 5511665856.0, + "step": 10782 + }, + { + "epoch": 2.915900486749594, + "grad_norm": 3.3125, + "learning_rate": 0.009030480306922188, + "loss": 3.3031, + "mean_token_accuracy": 0.3802465796470642, + "num_tokens": 5512190124.0, + "step": 10783 + }, + { + "epoch": 2.916170903190914, + "grad_norm": 2.609375, + "learning_rate": 0.009028942059262766, + "loss": 2.9731, + "mean_token_accuracy": 0.4081421196460724, + "num_tokens": 5512714391.0, + "step": 10784 + }, + { + "epoch": 2.9164413196322334, + "grad_norm": 3.328125, + "learning_rate": 0.009027403872081197, + "loss": 2.8172, + "mean_token_accuracy": 0.41279366612434387, + "num_tokens": 5513208841.0, + "step": 10785 + }, + { + "epoch": 2.9167117360735535, + "grad_norm": 2.171875, + "learning_rate": 0.009025865745424674, + "loss": 3.0336, + "mean_token_accuracy": 0.4139212369918823, + "num_tokens": 5513733075.0, + "step": 10786 + }, + { + "epoch": 2.9169821525148727, + "grad_norm": 3.234375, + "learning_rate": 0.009024327679340402, + "loss": 3.1261, + "mean_token_accuracy": 0.41150718927383423, + "num_tokens": 5514257323.0, + "step": 10787 + }, + { + "epoch": 2.917252568956193, + "grad_norm": 3.0, + "learning_rate": 0.009022789673875566, + "loss": 3.0599, + "mean_token_accuracy": 0.4172402620315552, + "num_tokens": 5514781529.0, + "step": 10788 + }, + { + "epoch": 2.917522985397512, + "grad_norm": 2.875, + "learning_rate": 0.009021251729077355, + "loss": 3.1327, + "mean_token_accuracy": 0.3976193368434906, + "num_tokens": 5515305624.0, + "step": 10789 + }, + { + "epoch": 2.917793401838832, + "grad_norm": 3.046875, + "learning_rate": 0.009019713844992963, + "loss": 3.3311, + "mean_token_accuracy": 0.3896925449371338, + "num_tokens": 5515829893.0, + "step": 10790 + }, + { + "epoch": 2.9180638182801513, + "grad_norm": 168.0, + "learning_rate": 0.00901817602166957, + "loss": 26.8554, + "mean_token_accuracy": 2.7998210498481058e-05, + "num_tokens": 5516311938.0, + "step": 10791 + }, + { + "epoch": 2.918334234721471, + "grad_norm": 5.5, + "learning_rate": 0.009016638259154369, + "loss": 3.3411, + "mean_token_accuracy": 0.3712756037712097, + "num_tokens": 5516836135.0, + "step": 10792 + }, + { + "epoch": 2.9186046511627906, + "grad_norm": 10.9375, + "learning_rate": 0.009015100557494535, + "loss": 3.1581, + "mean_token_accuracy": 0.39651551842689514, + "num_tokens": 5517360411.0, + "step": 10793 + }, + { + "epoch": 2.91887506760411, + "grad_norm": 2.25, + "learning_rate": 0.009013562916737257, + "loss": 3.1784, + "mean_token_accuracy": 0.39897051453590393, + "num_tokens": 5517854590.0, + "step": 10794 + }, + { + "epoch": 2.91914548404543, + "grad_norm": 2.578125, + "learning_rate": 0.009012025336929706, + "loss": 3.0433, + "mean_token_accuracy": 0.41044050455093384, + "num_tokens": 5518341504.0, + "step": 10795 + }, + { + "epoch": 2.9194159004867495, + "grad_norm": 2.609375, + "learning_rate": 0.009010487818119068, + "loss": 2.8926, + "mean_token_accuracy": 0.4151151478290558, + "num_tokens": 5518810959.0, + "step": 10796 + }, + { + "epoch": 2.919686316928069, + "grad_norm": 3.140625, + "learning_rate": 0.009008950360352512, + "loss": 3.1442, + "mean_token_accuracy": 0.3997444808483124, + "num_tokens": 5519335227.0, + "step": 10797 + }, + { + "epoch": 2.9199567333693888, + "grad_norm": 3.015625, + "learning_rate": 0.009007412963677216, + "loss": 3.025, + "mean_token_accuracy": 0.4191358685493469, + "num_tokens": 5519831330.0, + "step": 10798 + }, + { + "epoch": 2.9202271498107084, + "grad_norm": 3.1875, + "learning_rate": 0.009005875628140347, + "loss": 3.0093, + "mean_token_accuracy": 0.399009108543396, + "num_tokens": 5520355404.0, + "step": 10799 + }, + { + "epoch": 2.920497566252028, + "grad_norm": 2.625, + "learning_rate": 0.009004338353789084, + "loss": 3.0054, + "mean_token_accuracy": 0.43572333455085754, + "num_tokens": 5520816086.0, + "step": 10800 + }, + { + "epoch": 2.9207679826933477, + "grad_norm": 3.40625, + "learning_rate": 0.009002801140670582, + "loss": 3.1686, + "mean_token_accuracy": 0.3979509472846985, + "num_tokens": 5521340354.0, + "step": 10801 + }, + { + "epoch": 2.9210383991346673, + "grad_norm": 2.96875, + "learning_rate": 0.009001263988832019, + "loss": 3.0778, + "mean_token_accuracy": 0.4598475992679596, + "num_tokens": 5521712971.0, + "step": 10802 + }, + { + "epoch": 2.921308815575987, + "grad_norm": 2.3125, + "learning_rate": 0.008999726898320555, + "loss": 2.8276, + "mean_token_accuracy": 0.43726566433906555, + "num_tokens": 5522237221.0, + "step": 10803 + }, + { + "epoch": 2.9215792320173066, + "grad_norm": 3.515625, + "learning_rate": 0.008998189869183349, + "loss": 2.9154, + "mean_token_accuracy": 0.39756661653518677, + "num_tokens": 5522761423.0, + "step": 10804 + }, + { + "epoch": 2.9218496484586263, + "grad_norm": 3.4375, + "learning_rate": 0.008996652901467566, + "loss": 3.1697, + "mean_token_accuracy": 0.4021647572517395, + "num_tokens": 5523285648.0, + "step": 10805 + }, + { + "epoch": 2.922120064899946, + "grad_norm": 4.0, + "learning_rate": 0.008995115995220364, + "loss": 2.9814, + "mean_token_accuracy": 0.41414332389831543, + "num_tokens": 5523809822.0, + "step": 10806 + }, + { + "epoch": 2.9223904813412656, + "grad_norm": 2.71875, + "learning_rate": 0.008993579150488897, + "loss": 2.8322, + "mean_token_accuracy": 0.42576512694358826, + "num_tokens": 5524333889.0, + "step": 10807 + }, + { + "epoch": 2.922660897782585, + "grad_norm": 2.921875, + "learning_rate": 0.008992042367320326, + "loss": 2.976, + "mean_token_accuracy": 0.4134133458137512, + "num_tokens": 5524858164.0, + "step": 10808 + }, + { + "epoch": 2.922931314223905, + "grad_norm": 3.015625, + "learning_rate": 0.008990505645761798, + "loss": 2.9925, + "mean_token_accuracy": 0.3944035768508911, + "num_tokens": 5525382323.0, + "step": 10809 + }, + { + "epoch": 2.9232017306652245, + "grad_norm": 2.640625, + "learning_rate": 0.008988968985860466, + "loss": 3.0538, + "mean_token_accuracy": 0.39860326051712036, + "num_tokens": 5525846496.0, + "step": 10810 + }, + { + "epoch": 2.923472147106544, + "grad_norm": 2.9375, + "learning_rate": 0.008987432387663481, + "loss": 11.5811, + "mean_token_accuracy": 0.0, + "num_tokens": 5526369639.0, + "step": 10811 + }, + { + "epoch": 2.9237425635478638, + "grad_norm": 9.125, + "learning_rate": 0.008985895851217987, + "loss": 3.2719, + "mean_token_accuracy": 0.3819946348667145, + "num_tokens": 5526893843.0, + "step": 10812 + }, + { + "epoch": 2.9240129799891834, + "grad_norm": 2.53125, + "learning_rate": 0.008984359376571131, + "loss": 2.8092, + "mean_token_accuracy": 0.4181938171386719, + "num_tokens": 5527418122.0, + "step": 10813 + }, + { + "epoch": 2.924283396430503, + "grad_norm": 3.125, + "learning_rate": 0.008982822963770062, + "loss": 3.2496, + "mean_token_accuracy": 0.4116821587085724, + "num_tokens": 5527883359.0, + "step": 10814 + }, + { + "epoch": 2.9245538128718227, + "grad_norm": 3.84375, + "learning_rate": 0.008981286612861912, + "loss": 3.3134, + "mean_token_accuracy": 0.3975212275981903, + "num_tokens": 5528346720.0, + "step": 10815 + }, + { + "epoch": 2.9248242293131423, + "grad_norm": 2.828125, + "learning_rate": 0.008979750323893827, + "loss": 3.138, + "mean_token_accuracy": 0.3941015601158142, + "num_tokens": 5528870986.0, + "step": 10816 + }, + { + "epoch": 2.925094645754462, + "grad_norm": 5.5, + "learning_rate": 0.008978214096912945, + "loss": 3.0292, + "mean_token_accuracy": 0.4540575444698334, + "num_tokens": 5529389014.0, + "step": 10817 + }, + { + "epoch": 2.9253650621957816, + "grad_norm": 2.15625, + "learning_rate": 0.008976677931966395, + "loss": 2.9466, + "mean_token_accuracy": 0.40174633264541626, + "num_tokens": 5529876208.0, + "step": 10818 + }, + { + "epoch": 2.9256354786371013, + "grad_norm": 2.796875, + "learning_rate": 0.008975141829101321, + "loss": 2.9821, + "mean_token_accuracy": 0.4050469696521759, + "num_tokens": 5530374711.0, + "step": 10819 + }, + { + "epoch": 2.925905895078421, + "grad_norm": 3.125, + "learning_rate": 0.008973605788364846, + "loss": 3.0447, + "mean_token_accuracy": 0.39785319566726685, + "num_tokens": 5530898925.0, + "step": 10820 + }, + { + "epoch": 2.9261763115197406, + "grad_norm": 2.953125, + "learning_rate": 0.008972069809804107, + "loss": 3.15, + "mean_token_accuracy": 0.43552833795547485, + "num_tokens": 5531294922.0, + "step": 10821 + }, + { + "epoch": 2.9264467279610598, + "grad_norm": 2.71875, + "learning_rate": 0.00897053389346623, + "loss": 3.0619, + "mean_token_accuracy": 0.4210137128829956, + "num_tokens": 5531819168.0, + "step": 10822 + }, + { + "epoch": 2.92671714440238, + "grad_norm": 3.15625, + "learning_rate": 0.008968998039398345, + "loss": 3.0339, + "mean_token_accuracy": 0.41875699162483215, + "num_tokens": 5532343356.0, + "step": 10823 + }, + { + "epoch": 2.926987560843699, + "grad_norm": 3.421875, + "learning_rate": 0.008967462247647566, + "loss": 3.2475, + "mean_token_accuracy": 0.4076138138771057, + "num_tokens": 5532867609.0, + "step": 10824 + }, + { + "epoch": 2.927257977285019, + "grad_norm": 3.703125, + "learning_rate": 0.008965926518261027, + "loss": 3.0159, + "mean_token_accuracy": 0.412352979183197, + "num_tokens": 5533391815.0, + "step": 10825 + }, + { + "epoch": 2.9275283937263383, + "grad_norm": 3.078125, + "learning_rate": 0.008964390851285845, + "loss": 3.0432, + "mean_token_accuracy": 0.4162231981754303, + "num_tokens": 5533916075.0, + "step": 10826 + }, + { + "epoch": 2.9277988101676584, + "grad_norm": 3.40625, + "learning_rate": 0.008962855246769133, + "loss": 3.0323, + "mean_token_accuracy": 0.4043448567390442, + "num_tokens": 5534440306.0, + "step": 10827 + }, + { + "epoch": 2.9280692266089776, + "grad_norm": 3.203125, + "learning_rate": 0.008961319704758018, + "loss": 3.0014, + "mean_token_accuracy": 0.4110187292098999, + "num_tokens": 5534964510.0, + "step": 10828 + }, + { + "epoch": 2.9283396430502977, + "grad_norm": 3.125, + "learning_rate": 0.008959784225299609, + "loss": 3.2516, + "mean_token_accuracy": 0.3949728012084961, + "num_tokens": 5535488662.0, + "step": 10829 + }, + { + "epoch": 2.928610059491617, + "grad_norm": 2.75, + "learning_rate": 0.008958248808441016, + "loss": 3.0427, + "mean_token_accuracy": 0.4103699326515198, + "num_tokens": 5536012837.0, + "step": 10830 + }, + { + "epoch": 2.928880475932937, + "grad_norm": 9.625, + "learning_rate": 0.008956713454229359, + "loss": 8.7366, + "mean_token_accuracy": 0.012403249740600586, + "num_tokens": 5536537049.0, + "step": 10831 + }, + { + "epoch": 2.929150892374256, + "grad_norm": 6.125, + "learning_rate": 0.00895517816271174, + "loss": 3.1368, + "mean_token_accuracy": 0.40596675872802734, + "num_tokens": 5537061304.0, + "step": 10832 + }, + { + "epoch": 2.929421308815576, + "grad_norm": 2.171875, + "learning_rate": 0.008953642933935269, + "loss": 2.9624, + "mean_token_accuracy": 0.40603184700012207, + "num_tokens": 5537585535.0, + "step": 10833 + }, + { + "epoch": 2.9296917252568955, + "grad_norm": 3.765625, + "learning_rate": 0.008952107767947049, + "loss": 3.3494, + "mean_token_accuracy": 0.3886333405971527, + "num_tokens": 5538054890.0, + "step": 10834 + }, + { + "epoch": 2.929962141698215, + "grad_norm": 3.5, + "learning_rate": 0.008950572664794185, + "loss": 3.2121, + "mean_token_accuracy": 0.39603596925735474, + "num_tokens": 5538546282.0, + "step": 10835 + }, + { + "epoch": 2.9302325581395348, + "grad_norm": 3.828125, + "learning_rate": 0.008949037624523782, + "loss": 3.3059, + "mean_token_accuracy": 0.3661468029022217, + "num_tokens": 5539066022.0, + "step": 10836 + }, + { + "epoch": 2.9305029745808544, + "grad_norm": 4.625, + "learning_rate": 0.008947502647182935, + "loss": 3.0015, + "mean_token_accuracy": 0.439750611782074, + "num_tokens": 5539590288.0, + "step": 10837 + }, + { + "epoch": 2.930773391022174, + "grad_norm": 2.421875, + "learning_rate": 0.008945967732818739, + "loss": 3.082, + "mean_token_accuracy": 0.38829195499420166, + "num_tokens": 5540109191.0, + "step": 10838 + }, + { + "epoch": 2.9310438074634937, + "grad_norm": 3.546875, + "learning_rate": 0.008944432881478297, + "loss": 3.0504, + "mean_token_accuracy": 0.4147666394710541, + "num_tokens": 5540633425.0, + "step": 10839 + }, + { + "epoch": 2.9313142239048133, + "grad_norm": 2.890625, + "learning_rate": 0.0089428980932087, + "loss": 2.9815, + "mean_token_accuracy": 0.4309718608856201, + "num_tokens": 5541157539.0, + "step": 10840 + }, + { + "epoch": 2.931584640346133, + "grad_norm": 3.90625, + "learning_rate": 0.008941363368057035, + "loss": 3.2938, + "mean_token_accuracy": 0.39473533630371094, + "num_tokens": 5541620798.0, + "step": 10841 + }, + { + "epoch": 2.9318550567874526, + "grad_norm": 3.25, + "learning_rate": 0.008939828706070399, + "loss": 3.2351, + "mean_token_accuracy": 0.3987216353416443, + "num_tokens": 5542062000.0, + "step": 10842 + }, + { + "epoch": 2.9321254732287723, + "grad_norm": 2.671875, + "learning_rate": 0.008938294107295878, + "loss": 2.774, + "mean_token_accuracy": 0.5001708269119263, + "num_tokens": 5542520902.0, + "step": 10843 + }, + { + "epoch": 2.932395889670092, + "grad_norm": 2.3125, + "learning_rate": 0.008936759571780552, + "loss": 3.0652, + "mean_token_accuracy": 0.4017222821712494, + "num_tokens": 5543045100.0, + "step": 10844 + }, + { + "epoch": 2.9326663061114115, + "grad_norm": 2.890625, + "learning_rate": 0.008935225099571516, + "loss": 2.9818, + "mean_token_accuracy": 0.4245590567588806, + "num_tokens": 5543569096.0, + "step": 10845 + }, + { + "epoch": 2.932936722552731, + "grad_norm": 2.71875, + "learning_rate": 0.008933690690715845, + "loss": 2.9993, + "mean_token_accuracy": 0.4328179955482483, + "num_tokens": 5544027764.0, + "step": 10846 + }, + { + "epoch": 2.933207138994051, + "grad_norm": 2.6875, + "learning_rate": 0.008932156345260615, + "loss": 2.9534, + "mean_token_accuracy": 0.41837701201438904, + "num_tokens": 5544552008.0, + "step": 10847 + }, + { + "epoch": 2.9334775554353705, + "grad_norm": 47.5, + "learning_rate": 0.008930622063252914, + "loss": 3.1812, + "mean_token_accuracy": 0.3896175026893616, + "num_tokens": 5545076033.0, + "step": 10848 + }, + { + "epoch": 2.93374797187669, + "grad_norm": 5.5, + "learning_rate": 0.008929087844739813, + "loss": 3.2878, + "mean_token_accuracy": 0.38767385482788086, + "num_tokens": 5545500553.0, + "step": 10849 + }, + { + "epoch": 2.9340183883180098, + "grad_norm": 1.7734375, + "learning_rate": 0.008927553689768386, + "loss": 3.1028, + "mean_token_accuracy": 0.39961695671081543, + "num_tokens": 5546024691.0, + "step": 10850 + }, + { + "epoch": 2.9342888047593294, + "grad_norm": 65.5, + "learning_rate": 0.008926019598385708, + "loss": 11.0591, + "mean_token_accuracy": 0.008465266786515713, + "num_tokens": 5546548912.0, + "step": 10851 + }, + { + "epoch": 2.934559221200649, + "grad_norm": 8.375, + "learning_rate": 0.00892448557063885, + "loss": 3.4343, + "mean_token_accuracy": 0.3717452883720398, + "num_tokens": 5547055722.0, + "step": 10852 + }, + { + "epoch": 2.9348296376419687, + "grad_norm": 2.9375, + "learning_rate": 0.008922951606574872, + "loss": 3.2314, + "mean_token_accuracy": 0.3758805990219116, + "num_tokens": 5547566971.0, + "step": 10853 + }, + { + "epoch": 2.9351000540832883, + "grad_norm": 3.0, + "learning_rate": 0.008921417706240851, + "loss": 3.2614, + "mean_token_accuracy": 0.3963279724121094, + "num_tokens": 5548091190.0, + "step": 10854 + }, + { + "epoch": 2.935370470524608, + "grad_norm": 2.4375, + "learning_rate": 0.008919883869683844, + "loss": 2.8372, + "mean_token_accuracy": 0.43346649408340454, + "num_tokens": 5548577900.0, + "step": 10855 + }, + { + "epoch": 2.9356408869659276, + "grad_norm": 2.390625, + "learning_rate": 0.008918350096950923, + "loss": 2.8613, + "mean_token_accuracy": 0.43162038922309875, + "num_tokens": 5549074339.0, + "step": 10856 + }, + { + "epoch": 2.9359113034072473, + "grad_norm": 2.515625, + "learning_rate": 0.00891681638808914, + "loss": 3.0112, + "mean_token_accuracy": 0.41795778274536133, + "num_tokens": 5549598583.0, + "step": 10857 + }, + { + "epoch": 2.936181719848567, + "grad_norm": 3.109375, + "learning_rate": 0.008915282743145554, + "loss": 3.0322, + "mean_token_accuracy": 0.41585731506347656, + "num_tokens": 5550073790.0, + "step": 10858 + }, + { + "epoch": 2.9364521362898865, + "grad_norm": 3.109375, + "learning_rate": 0.008913749162167228, + "loss": 2.9698, + "mean_token_accuracy": 0.4173058867454529, + "num_tokens": 5550580957.0, + "step": 10859 + }, + { + "epoch": 2.936722552731206, + "grad_norm": 3.03125, + "learning_rate": 0.008912215645201211, + "loss": 3.0784, + "mean_token_accuracy": 0.4137447476387024, + "num_tokens": 5551105113.0, + "step": 10860 + }, + { + "epoch": 2.936992969172526, + "grad_norm": 3.53125, + "learning_rate": 0.008910682192294557, + "loss": 3.098, + "mean_token_accuracy": 0.40825510025024414, + "num_tokens": 5551629216.0, + "step": 10861 + }, + { + "epoch": 2.9372633856138455, + "grad_norm": 3.0, + "learning_rate": 0.00890914880349432, + "loss": 3.0463, + "mean_token_accuracy": 0.4203854203224182, + "num_tokens": 5552153484.0, + "step": 10862 + }, + { + "epoch": 2.9375338020551647, + "grad_norm": 3.78125, + "learning_rate": 0.008907615478847546, + "loss": 3.1441, + "mean_token_accuracy": 0.4055411219596863, + "num_tokens": 5552617925.0, + "step": 10863 + }, + { + "epoch": 2.9378042184964848, + "grad_norm": 2.765625, + "learning_rate": 0.008906082218401279, + "loss": 3.0476, + "mean_token_accuracy": 0.4150490462779999, + "num_tokens": 5553142154.0, + "step": 10864 + }, + { + "epoch": 2.938074634937804, + "grad_norm": 3.234375, + "learning_rate": 0.008904549022202573, + "loss": 3.0666, + "mean_token_accuracy": 0.41199350357055664, + "num_tokens": 5553666430.0, + "step": 10865 + }, + { + "epoch": 2.938345051379124, + "grad_norm": 2.5625, + "learning_rate": 0.008903015890298462, + "loss": 3.0807, + "mean_token_accuracy": 0.39295825362205505, + "num_tokens": 5554190712.0, + "step": 10866 + }, + { + "epoch": 2.9386154678204432, + "grad_norm": 3.453125, + "learning_rate": 0.00890148282273599, + "loss": 2.9391, + "mean_token_accuracy": 0.3964419364929199, + "num_tokens": 5554714866.0, + "step": 10867 + }, + { + "epoch": 2.9388858842617633, + "grad_norm": 2.234375, + "learning_rate": 0.008899949819562198, + "loss": 2.9549, + "mean_token_accuracy": 0.4147716760635376, + "num_tokens": 5555239037.0, + "step": 10868 + }, + { + "epoch": 2.9391563007030825, + "grad_norm": 2.6875, + "learning_rate": 0.008898416880824121, + "loss": 3.1569, + "mean_token_accuracy": 0.3812355697154999, + "num_tokens": 5555763197.0, + "step": 10869 + }, + { + "epoch": 2.9394267171444026, + "grad_norm": 2.234375, + "learning_rate": 0.008896884006568791, + "loss": 3.0043, + "mean_token_accuracy": 0.42818140983581543, + "num_tokens": 5556287225.0, + "step": 10870 + }, + { + "epoch": 2.939697133585722, + "grad_norm": 3.296875, + "learning_rate": 0.00889535119684325, + "loss": 11.0776, + "mean_token_accuracy": 2.7759206204791553e-05, + "num_tokens": 5556806296.0, + "step": 10871 + }, + { + "epoch": 2.939967550027042, + "grad_norm": 6.28125, + "learning_rate": 0.008893818451694522, + "loss": 3.1766, + "mean_token_accuracy": 0.3981887400150299, + "num_tokens": 5557306418.0, + "step": 10872 + }, + { + "epoch": 2.940237966468361, + "grad_norm": 3.9375, + "learning_rate": 0.008892285771169633, + "loss": 3.1277, + "mean_token_accuracy": 0.4180424213409424, + "num_tokens": 5557809063.0, + "step": 10873 + }, + { + "epoch": 2.9405083829096808, + "grad_norm": 3.0, + "learning_rate": 0.008890753155315621, + "loss": 3.1225, + "mean_token_accuracy": 0.3883410692214966, + "num_tokens": 5558311573.0, + "step": 10874 + }, + { + "epoch": 2.9407787993510004, + "grad_norm": 3.359375, + "learning_rate": 0.008889220604179501, + "loss": 2.8561, + "mean_token_accuracy": 0.3915068209171295, + "num_tokens": 5558835763.0, + "step": 10875 + }, + { + "epoch": 2.94104921579232, + "grad_norm": 3.265625, + "learning_rate": 0.008887688117808304, + "loss": 2.9346, + "mean_token_accuracy": 0.4227415919303894, + "num_tokens": 5559360029.0, + "step": 10876 + }, + { + "epoch": 2.9413196322336397, + "grad_norm": 3.125, + "learning_rate": 0.008886155696249046, + "loss": 3.0902, + "mean_token_accuracy": 0.38484275341033936, + "num_tokens": 5559884307.0, + "step": 10877 + }, + { + "epoch": 2.9415900486749593, + "grad_norm": 2.609375, + "learning_rate": 0.008884623339548743, + "loss": 2.9997, + "mean_token_accuracy": 0.4153377413749695, + "num_tokens": 5560408514.0, + "step": 10878 + }, + { + "epoch": 2.941860465116279, + "grad_norm": 3.125, + "learning_rate": 0.008883091047754421, + "loss": 2.8933, + "mean_token_accuracy": 0.4172893166542053, + "num_tokens": 5560889348.0, + "step": 10879 + }, + { + "epoch": 2.9421308815575986, + "grad_norm": 2.734375, + "learning_rate": 0.008881558820913096, + "loss": 3.0148, + "mean_token_accuracy": 0.41268497705459595, + "num_tokens": 5561413609.0, + "step": 10880 + }, + { + "epoch": 2.9424012979989183, + "grad_norm": 2.796875, + "learning_rate": 0.008880026659071767, + "loss": 2.8816, + "mean_token_accuracy": 0.43015652894973755, + "num_tokens": 5561937685.0, + "step": 10881 + }, + { + "epoch": 2.942671714440238, + "grad_norm": 2.703125, + "learning_rate": 0.008878494562277461, + "loss": 3.0748, + "mean_token_accuracy": 0.4278087913990021, + "num_tokens": 5562414255.0, + "step": 10882 + }, + { + "epoch": 2.9429421308815575, + "grad_norm": 2.734375, + "learning_rate": 0.00887696253057718, + "loss": 3.1171, + "mean_token_accuracy": 0.4001912474632263, + "num_tokens": 5562925694.0, + "step": 10883 + }, + { + "epoch": 2.943212547322877, + "grad_norm": 2.796875, + "learning_rate": 0.00887543056401793, + "loss": 3.0929, + "mean_token_accuracy": 0.399664044380188, + "num_tokens": 5563449823.0, + "step": 10884 + }, + { + "epoch": 2.943482963764197, + "grad_norm": 2.78125, + "learning_rate": 0.008873898662646719, + "loss": 3.0401, + "mean_token_accuracy": 0.3948018550872803, + "num_tokens": 5563974000.0, + "step": 10885 + }, + { + "epoch": 2.9437533802055165, + "grad_norm": 2.734375, + "learning_rate": 0.008872366826510549, + "loss": 2.8836, + "mean_token_accuracy": 0.40404146909713745, + "num_tokens": 5564498089.0, + "step": 10886 + }, + { + "epoch": 2.944023796646836, + "grad_norm": 2.6875, + "learning_rate": 0.008870835055656421, + "loss": 3.2246, + "mean_token_accuracy": 0.40496766567230225, + "num_tokens": 5564976976.0, + "step": 10887 + }, + { + "epoch": 2.9442942130881558, + "grad_norm": 3.5625, + "learning_rate": 0.008869303350131337, + "loss": 3.1027, + "mean_token_accuracy": 0.3922605812549591, + "num_tokens": 5565501246.0, + "step": 10888 + }, + { + "epoch": 2.9445646295294754, + "grad_norm": 2.484375, + "learning_rate": 0.008867771709982293, + "loss": 3.0379, + "mean_token_accuracy": 0.4053604006767273, + "num_tokens": 5566025357.0, + "step": 10889 + }, + { + "epoch": 2.944835045970795, + "grad_norm": 3.5625, + "learning_rate": 0.008866240135256281, + "loss": 3.0881, + "mean_token_accuracy": 0.4157222509384155, + "num_tokens": 5566529890.0, + "step": 10890 + }, + { + "epoch": 2.9451054624121147, + "grad_norm": 118.0, + "learning_rate": 0.008864708626000296, + "loss": 10.2062, + "mean_token_accuracy": 0.0, + "num_tokens": 5567032403.0, + "step": 10891 + }, + { + "epoch": 2.9453758788534343, + "grad_norm": 5.1875, + "learning_rate": 0.008863177182261333, + "loss": 3.0255, + "mean_token_accuracy": 0.4063328802585602, + "num_tokens": 5567556549.0, + "step": 10892 + }, + { + "epoch": 2.945646295294754, + "grad_norm": 3.046875, + "learning_rate": 0.008861645804086373, + "loss": 2.8642, + "mean_token_accuracy": 0.42876607179641724, + "num_tokens": 5568080757.0, + "step": 10893 + }, + { + "epoch": 2.9459167117360736, + "grad_norm": 2.3125, + "learning_rate": 0.008860114491522414, + "loss": 2.992, + "mean_token_accuracy": 0.4246542453765869, + "num_tokens": 5568580183.0, + "step": 10894 + }, + { + "epoch": 2.9461871281773933, + "grad_norm": 4.0, + "learning_rate": 0.008858583244616434, + "loss": 2.9583, + "mean_token_accuracy": 0.4127323031425476, + "num_tokens": 5569047204.0, + "step": 10895 + }, + { + "epoch": 2.946457544618713, + "grad_norm": 2.359375, + "learning_rate": 0.008857052063415414, + "loss": 3.0164, + "mean_token_accuracy": 0.4138527512550354, + "num_tokens": 5569528469.0, + "step": 10896 + }, + { + "epoch": 2.9467279610600325, + "grad_norm": 4.15625, + "learning_rate": 0.00885552094796634, + "loss": 3.1881, + "mean_token_accuracy": 0.4209550619125366, + "num_tokens": 5569964411.0, + "step": 10897 + }, + { + "epoch": 2.946998377501352, + "grad_norm": 2.46875, + "learning_rate": 0.008853989898316192, + "loss": 2.742, + "mean_token_accuracy": 0.4150781035423279, + "num_tokens": 5570488682.0, + "step": 10898 + }, + { + "epoch": 2.947268793942672, + "grad_norm": 3.640625, + "learning_rate": 0.008852458914511942, + "loss": 2.8883, + "mean_token_accuracy": 0.4054557979106903, + "num_tokens": 5571012852.0, + "step": 10899 + }, + { + "epoch": 2.9475392103839915, + "grad_norm": 3.09375, + "learning_rate": 0.008850927996600573, + "loss": 2.8661, + "mean_token_accuracy": 0.42324143648147583, + "num_tokens": 5571474748.0, + "step": 10900 + }, + { + "epoch": 2.947809626825311, + "grad_norm": 2.734375, + "learning_rate": 0.008849397144629048, + "loss": 2.8248, + "mean_token_accuracy": 0.41506561636924744, + "num_tokens": 5571999020.0, + "step": 10901 + }, + { + "epoch": 2.9480800432666308, + "grad_norm": 3.71875, + "learning_rate": 0.008847866358644344, + "loss": 3.0204, + "mean_token_accuracy": 0.4172234833240509, + "num_tokens": 5572523207.0, + "step": 10902 + }, + { + "epoch": 2.9483504597079504, + "grad_norm": 3.46875, + "learning_rate": 0.008846335638693432, + "loss": 3.2696, + "mean_token_accuracy": 0.3969593644142151, + "num_tokens": 5573022263.0, + "step": 10903 + }, + { + "epoch": 2.9486208761492696, + "grad_norm": 3.40625, + "learning_rate": 0.008844804984823275, + "loss": 3.045, + "mean_token_accuracy": 0.4070112109184265, + "num_tokens": 5573499531.0, + "step": 10904 + }, + { + "epoch": 2.9488912925905897, + "grad_norm": 3.265625, + "learning_rate": 0.008843274397080837, + "loss": 3.0993, + "mean_token_accuracy": 0.4173371195793152, + "num_tokens": 5574023667.0, + "step": 10905 + }, + { + "epoch": 2.949161709031909, + "grad_norm": 4.53125, + "learning_rate": 0.008841743875513087, + "loss": 3.0503, + "mean_token_accuracy": 0.4200022518634796, + "num_tokens": 5574471916.0, + "step": 10906 + }, + { + "epoch": 2.949432125473229, + "grad_norm": 2.53125, + "learning_rate": 0.008840213420166975, + "loss": 2.9516, + "mean_token_accuracy": 0.420352041721344, + "num_tokens": 5574996126.0, + "step": 10907 + }, + { + "epoch": 2.949702541914548, + "grad_norm": 3.84375, + "learning_rate": 0.008838683031089475, + "loss": 2.8919, + "mean_token_accuracy": 0.4059748649597168, + "num_tokens": 5575520394.0, + "step": 10908 + }, + { + "epoch": 2.9499729583558683, + "grad_norm": 2.90625, + "learning_rate": 0.008837152708327531, + "loss": 2.9395, + "mean_token_accuracy": 0.4268814027309418, + "num_tokens": 5576000808.0, + "step": 10909 + }, + { + "epoch": 2.9502433747971875, + "grad_norm": 2.984375, + "learning_rate": 0.0088356224519281, + "loss": 2.8614, + "mean_token_accuracy": 0.4271862208843231, + "num_tokens": 5576505550.0, + "step": 10910 + }, + { + "epoch": 2.9505137912385075, + "grad_norm": 0.8671875, + "learning_rate": 0.008834092261938143, + "loss": 10.9637, + "mean_token_accuracy": 5.917019734624773e-06, + "num_tokens": 5577029833.0, + "step": 10911 + }, + { + "epoch": 2.9507842076798267, + "grad_norm": 6.96875, + "learning_rate": 0.008832562138404602, + "loss": 3.3834, + "mean_token_accuracy": 0.3691931664943695, + "num_tokens": 5577554115.0, + "step": 10912 + }, + { + "epoch": 2.951054624121147, + "grad_norm": 2.234375, + "learning_rate": 0.008831032081374425, + "loss": 3.0748, + "mean_token_accuracy": 0.43921324610710144, + "num_tokens": 5578013769.0, + "step": 10913 + }, + { + "epoch": 2.951325040562466, + "grad_norm": 3.25, + "learning_rate": 0.008829502090894571, + "loss": 3.149, + "mean_token_accuracy": 0.40922626852989197, + "num_tokens": 5578537930.0, + "step": 10914 + }, + { + "epoch": 2.9515954570037857, + "grad_norm": 2.828125, + "learning_rate": 0.00882797216701197, + "loss": 3.1113, + "mean_token_accuracy": 0.40812253952026367, + "num_tokens": 5579062163.0, + "step": 10915 + }, + { + "epoch": 2.9518658734451053, + "grad_norm": 3.09375, + "learning_rate": 0.00882644230977357, + "loss": 3.2738, + "mean_token_accuracy": 0.4051966667175293, + "num_tokens": 5579552792.0, + "step": 10916 + }, + { + "epoch": 2.952136289886425, + "grad_norm": 3.109375, + "learning_rate": 0.008824912519226315, + "loss": 3.0441, + "mean_token_accuracy": 0.40664541721343994, + "num_tokens": 5580076973.0, + "step": 10917 + }, + { + "epoch": 2.9524067063277446, + "grad_norm": 3.28125, + "learning_rate": 0.008823382795417138, + "loss": 3.1018, + "mean_token_accuracy": 0.4023803770542145, + "num_tokens": 5580601231.0, + "step": 10918 + }, + { + "epoch": 2.9526771227690642, + "grad_norm": 3.078125, + "learning_rate": 0.008821853138392979, + "loss": 3.2284, + "mean_token_accuracy": 0.3945583403110504, + "num_tokens": 5581125428.0, + "step": 10919 + }, + { + "epoch": 2.952947539210384, + "grad_norm": 3.484375, + "learning_rate": 0.008820323548200771, + "loss": 3.0654, + "mean_token_accuracy": 0.39158564805984497, + "num_tokens": 5581606815.0, + "step": 10920 + }, + { + "epoch": 2.9532179556517035, + "grad_norm": 2.796875, + "learning_rate": 0.00881879402488745, + "loss": 3.0679, + "mean_token_accuracy": 0.41423824429512024, + "num_tokens": 5582078660.0, + "step": 10921 + }, + { + "epoch": 2.953488372093023, + "grad_norm": 4.4375, + "learning_rate": 0.008817264568499935, + "loss": 3.1261, + "mean_token_accuracy": 0.3875710368156433, + "num_tokens": 5582602858.0, + "step": 10922 + }, + { + "epoch": 2.953758788534343, + "grad_norm": 2.421875, + "learning_rate": 0.008815735179085168, + "loss": 2.9223, + "mean_token_accuracy": 0.4198109209537506, + "num_tokens": 5583115781.0, + "step": 10923 + }, + { + "epoch": 2.9540292049756625, + "grad_norm": 3.28125, + "learning_rate": 0.008814205856690065, + "loss": 2.9439, + "mean_token_accuracy": 0.4124686121940613, + "num_tokens": 5583639935.0, + "step": 10924 + }, + { + "epoch": 2.954299621416982, + "grad_norm": 2.546875, + "learning_rate": 0.008812676601361558, + "loss": 3.057, + "mean_token_accuracy": 0.40822917222976685, + "num_tokens": 5584164103.0, + "step": 10925 + }, + { + "epoch": 2.9545700378583017, + "grad_norm": 3.109375, + "learning_rate": 0.008811147413146562, + "loss": 3.0633, + "mean_token_accuracy": 0.41620293259620667, + "num_tokens": 5584623800.0, + "step": 10926 + }, + { + "epoch": 2.9548404542996214, + "grad_norm": 3.234375, + "learning_rate": 0.008809618292092005, + "loss": 3.1846, + "mean_token_accuracy": 0.3924708962440491, + "num_tokens": 5585148037.0, + "step": 10927 + }, + { + "epoch": 2.955110870740941, + "grad_norm": 3.28125, + "learning_rate": 0.008808089238244797, + "loss": 3.0284, + "mean_token_accuracy": 0.39539483189582825, + "num_tokens": 5585672181.0, + "step": 10928 + }, + { + "epoch": 2.9553812871822607, + "grad_norm": 3.625, + "learning_rate": 0.00880656025165186, + "loss": 3.2298, + "mean_token_accuracy": 0.40511369705200195, + "num_tokens": 5586148199.0, + "step": 10929 + }, + { + "epoch": 2.9556517036235803, + "grad_norm": 3.28125, + "learning_rate": 0.008805031332360101, + "loss": 3.1324, + "mean_token_accuracy": 0.3875405788421631, + "num_tokens": 5586672195.0, + "step": 10930 + }, + { + "epoch": 2.9559221200649, + "grad_norm": 36.5, + "learning_rate": 0.008803502480416439, + "loss": 12.6843, + "mean_token_accuracy": 0.007861402817070484, + "num_tokens": 5587196436.0, + "step": 10931 + }, + { + "epoch": 2.9561925365062196, + "grad_norm": 7.53125, + "learning_rate": 0.00880197369586778, + "loss": 3.4611, + "mean_token_accuracy": 0.36893150210380554, + "num_tokens": 5587720718.0, + "step": 10932 + }, + { + "epoch": 2.9564629529475392, + "grad_norm": 1.96875, + "learning_rate": 0.008800444978761028, + "loss": 3.1138, + "mean_token_accuracy": 0.41033560037612915, + "num_tokens": 5588244950.0, + "step": 10933 + }, + { + "epoch": 2.956733369388859, + "grad_norm": 2.78125, + "learning_rate": 0.008798916329143097, + "loss": 3.0917, + "mean_token_accuracy": 0.4059787392616272, + "num_tokens": 5588769105.0, + "step": 10934 + }, + { + "epoch": 2.9570037858301785, + "grad_norm": 2.96875, + "learning_rate": 0.008797387747060886, + "loss": 3.268, + "mean_token_accuracy": 0.39359205961227417, + "num_tokens": 5589293371.0, + "step": 10935 + }, + { + "epoch": 2.957274202271498, + "grad_norm": 2.984375, + "learning_rate": 0.008795859232561291, + "loss": 3.1364, + "mean_token_accuracy": 0.38529932498931885, + "num_tokens": 5589817654.0, + "step": 10936 + }, + { + "epoch": 2.957544618712818, + "grad_norm": 3.328125, + "learning_rate": 0.008794330785691222, + "loss": 3.0998, + "mean_token_accuracy": 0.40968313813209534, + "num_tokens": 5590341847.0, + "step": 10937 + }, + { + "epoch": 2.9578150351541375, + "grad_norm": 2.640625, + "learning_rate": 0.008792802406497572, + "loss": 3.0534, + "mean_token_accuracy": 0.3970124423503876, + "num_tokens": 5590866040.0, + "step": 10938 + }, + { + "epoch": 2.958085451595457, + "grad_norm": 2.453125, + "learning_rate": 0.00879127409502723, + "loss": 3.034, + "mean_token_accuracy": 0.41368094086647034, + "num_tokens": 5591390236.0, + "step": 10939 + }, + { + "epoch": 2.9583558680367767, + "grad_norm": 2.859375, + "learning_rate": 0.008789745851327097, + "loss": 3.0481, + "mean_token_accuracy": 0.41433990001678467, + "num_tokens": 5591914415.0, + "step": 10940 + }, + { + "epoch": 2.9586262844780964, + "grad_norm": 3.234375, + "learning_rate": 0.008788217675444063, + "loss": 3.1367, + "mean_token_accuracy": 0.41107019782066345, + "num_tokens": 5592438693.0, + "step": 10941 + }, + { + "epoch": 2.958896700919416, + "grad_norm": 19.875, + "learning_rate": 0.00878668956742501, + "loss": 2.5404, + "mean_token_accuracy": 0.4852648079395294, + "num_tokens": 5592962885.0, + "step": 10942 + }, + { + "epoch": 2.9591671173607357, + "grad_norm": 4.34375, + "learning_rate": 0.008785161527316836, + "loss": 2.9411, + "mean_token_accuracy": 0.44789743423461914, + "num_tokens": 5593424356.0, + "step": 10943 + }, + { + "epoch": 2.9594375338020553, + "grad_norm": 2.484375, + "learning_rate": 0.008783633555166419, + "loss": 3.0159, + "mean_token_accuracy": 0.4185882806777954, + "num_tokens": 5593948571.0, + "step": 10944 + }, + { + "epoch": 2.959707950243375, + "grad_norm": 3.6875, + "learning_rate": 0.008782105651020637, + "loss": 3.0319, + "mean_token_accuracy": 0.4255877137184143, + "num_tokens": 5594472751.0, + "step": 10945 + }, + { + "epoch": 2.9599783666846946, + "grad_norm": 4.21875, + "learning_rate": 0.008780577814926383, + "loss": 2.8388, + "mean_token_accuracy": 0.4517187178134918, + "num_tokens": 5594996976.0, + "step": 10946 + }, + { + "epoch": 2.960248783126014, + "grad_norm": 1.9375, + "learning_rate": 0.008779050046930524, + "loss": 2.9023, + "mean_token_accuracy": 0.4275662302970886, + "num_tokens": 5595512488.0, + "step": 10947 + }, + { + "epoch": 2.960519199567334, + "grad_norm": 3.671875, + "learning_rate": 0.00877752234707994, + "loss": 3.1368, + "mean_token_accuracy": 0.4012911915779114, + "num_tokens": 5596036688.0, + "step": 10948 + }, + { + "epoch": 2.960789616008653, + "grad_norm": 3.09375, + "learning_rate": 0.008775994715421509, + "loss": 2.9921, + "mean_token_accuracy": 0.4172884225845337, + "num_tokens": 5596560816.0, + "step": 10949 + }, + { + "epoch": 2.961060032449973, + "grad_norm": 3.078125, + "learning_rate": 0.0087744671520021, + "loss": 3.0081, + "mean_token_accuracy": 0.43218159675598145, + "num_tokens": 5597074413.0, + "step": 10950 + }, + { + "epoch": 2.9613304488912924, + "grad_norm": 42.25, + "learning_rate": 0.008772939656868584, + "loss": 11.6457, + "mean_token_accuracy": 0.0, + "num_tokens": 5597598569.0, + "step": 10951 + }, + { + "epoch": 2.9616008653326125, + "grad_norm": 6.09375, + "learning_rate": 0.00877141223006783, + "loss": 3.3525, + "mean_token_accuracy": 0.3660663664340973, + "num_tokens": 5598122767.0, + "step": 10952 + }, + { + "epoch": 2.9618712817739317, + "grad_norm": 2.453125, + "learning_rate": 0.008769884871646696, + "loss": 2.9241, + "mean_token_accuracy": 0.40281325578689575, + "num_tokens": 5598646938.0, + "step": 10953 + }, + { + "epoch": 2.9621416982152518, + "grad_norm": 4.03125, + "learning_rate": 0.00876835758165206, + "loss": 2.8005, + "mean_token_accuracy": 0.4180232584476471, + "num_tokens": 5599171088.0, + "step": 10954 + }, + { + "epoch": 2.962412114656571, + "grad_norm": 3.03125, + "learning_rate": 0.008766830360130774, + "loss": 3.191, + "mean_token_accuracy": 0.39154988527297974, + "num_tokens": 5599695257.0, + "step": 10955 + }, + { + "epoch": 2.9626825310978906, + "grad_norm": 2.640625, + "learning_rate": 0.008765303207129696, + "loss": 2.7202, + "mean_token_accuracy": 0.4216591715812683, + "num_tokens": 5600219405.0, + "step": 10956 + }, + { + "epoch": 2.9629529475392102, + "grad_norm": 2.453125, + "learning_rate": 0.008763776122695693, + "loss": 3.1795, + "mean_token_accuracy": 0.4007517695426941, + "num_tokens": 5600717215.0, + "step": 10957 + }, + { + "epoch": 2.96322336398053, + "grad_norm": 2.734375, + "learning_rate": 0.008762249106875612, + "loss": 3.0072, + "mean_token_accuracy": 0.4009476900100708, + "num_tokens": 5601241435.0, + "step": 10958 + }, + { + "epoch": 2.9634937804218495, + "grad_norm": 2.5625, + "learning_rate": 0.008760722159716308, + "loss": 3.1219, + "mean_token_accuracy": 0.3942837715148926, + "num_tokens": 5601765711.0, + "step": 10959 + }, + { + "epoch": 2.963764196863169, + "grad_norm": 3.390625, + "learning_rate": 0.008759195281264637, + "loss": 3.1751, + "mean_token_accuracy": 0.4045458436012268, + "num_tokens": 5602257254.0, + "step": 10960 + }, + { + "epoch": 2.964034613304489, + "grad_norm": 2.625, + "learning_rate": 0.008757668471567445, + "loss": 3.0244, + "mean_token_accuracy": 0.3987521231174469, + "num_tokens": 5602781447.0, + "step": 10961 + }, + { + "epoch": 2.9643050297458085, + "grad_norm": 36.75, + "learning_rate": 0.008756141730671574, + "loss": 2.8614, + "mean_token_accuracy": 0.46055734157562256, + "num_tokens": 5603305670.0, + "step": 10962 + }, + { + "epoch": 2.964575446187128, + "grad_norm": 5.46875, + "learning_rate": 0.008754615058623875, + "loss": 3.361, + "mean_token_accuracy": 0.3701496124267578, + "num_tokens": 5603829889.0, + "step": 10963 + }, + { + "epoch": 2.9648458626284477, + "grad_norm": 1.8359375, + "learning_rate": 0.008753088455471193, + "loss": 3.1063, + "mean_token_accuracy": 0.39772868156433105, + "num_tokens": 5604354029.0, + "step": 10964 + }, + { + "epoch": 2.9651162790697674, + "grad_norm": 3.125, + "learning_rate": 0.008751561921260361, + "loss": 2.8978, + "mean_token_accuracy": 0.40345507860183716, + "num_tokens": 5604878240.0, + "step": 10965 + }, + { + "epoch": 2.965386695511087, + "grad_norm": 2.984375, + "learning_rate": 0.00875003545603822, + "loss": 2.8306, + "mean_token_accuracy": 0.40170156955718994, + "num_tokens": 5605380444.0, + "step": 10966 + }, + { + "epoch": 2.9656571119524067, + "grad_norm": 2.296875, + "learning_rate": 0.008748509059851613, + "loss": 2.9995, + "mean_token_accuracy": 0.4088559150695801, + "num_tokens": 5605904607.0, + "step": 10967 + }, + { + "epoch": 2.9659275283937263, + "grad_norm": 2.46875, + "learning_rate": 0.008746982732747367, + "loss": 2.76, + "mean_token_accuracy": 0.424538254737854, + "num_tokens": 5606428778.0, + "step": 10968 + }, + { + "epoch": 2.966197944835046, + "grad_norm": 2.921875, + "learning_rate": 0.008745456474772314, + "loss": 3.1836, + "mean_token_accuracy": 0.39157673716545105, + "num_tokens": 5606953040.0, + "step": 10969 + }, + { + "epoch": 2.9664683612763656, + "grad_norm": 2.546875, + "learning_rate": 0.008743930285973285, + "loss": 3.0062, + "mean_token_accuracy": 0.3871226906776428, + "num_tokens": 5607477153.0, + "step": 10970 + }, + { + "epoch": 2.9667387777176852, + "grad_norm": 36.0, + "learning_rate": 0.008742404166397115, + "loss": 11.4659, + "mean_token_accuracy": 0.010776825249195099, + "num_tokens": 5608001359.0, + "step": 10971 + }, + { + "epoch": 2.967009194159005, + "grad_norm": 6.09375, + "learning_rate": 0.008740878116090625, + "loss": 2.9538, + "mean_token_accuracy": 0.3886335492134094, + "num_tokens": 5608503084.0, + "step": 10972 + }, + { + "epoch": 2.9672796106003245, + "grad_norm": 2.6875, + "learning_rate": 0.008739352135100633, + "loss": 3.1619, + "mean_token_accuracy": 0.4022974967956543, + "num_tokens": 5609027255.0, + "step": 10973 + }, + { + "epoch": 2.967550027041644, + "grad_norm": 2.21875, + "learning_rate": 0.00873782622347397, + "loss": 2.911, + "mean_token_accuracy": 0.41025978326797485, + "num_tokens": 5609551273.0, + "step": 10974 + }, + { + "epoch": 2.967820443482964, + "grad_norm": 2.4375, + "learning_rate": 0.00873630038125745, + "loss": 2.9116, + "mean_token_accuracy": 0.42913371324539185, + "num_tokens": 5610075367.0, + "step": 10975 + }, + { + "epoch": 2.9680908599242835, + "grad_norm": 3.359375, + "learning_rate": 0.008734774608497888, + "loss": 3.0521, + "mean_token_accuracy": 0.4254758358001709, + "num_tokens": 5610599578.0, + "step": 10976 + }, + { + "epoch": 2.968361276365603, + "grad_norm": 2.8125, + "learning_rate": 0.008733248905242106, + "loss": 3.1143, + "mean_token_accuracy": 0.40723514556884766, + "num_tokens": 5611123572.0, + "step": 10977 + }, + { + "epoch": 2.9686316928069227, + "grad_norm": 3.140625, + "learning_rate": 0.008731723271536913, + "loss": 2.7891, + "mean_token_accuracy": 0.4307498335838318, + "num_tokens": 5611636311.0, + "step": 10978 + }, + { + "epoch": 2.9689021092482424, + "grad_norm": 3.25, + "learning_rate": 0.008730197707429119, + "loss": 2.7393, + "mean_token_accuracy": 0.4444577693939209, + "num_tokens": 5612142790.0, + "step": 10979 + }, + { + "epoch": 2.969172525689562, + "grad_norm": 2.9375, + "learning_rate": 0.008728672212965539, + "loss": 3.153, + "mean_token_accuracy": 0.4089258313179016, + "num_tokens": 5612614769.0, + "step": 10980 + }, + { + "epoch": 2.9694429421308817, + "grad_norm": 2.53125, + "learning_rate": 0.008727146788192973, + "loss": 2.9971, + "mean_token_accuracy": 0.39809516072273254, + "num_tokens": 5613139015.0, + "step": 10981 + }, + { + "epoch": 2.9697133585722013, + "grad_norm": 2.953125, + "learning_rate": 0.008725621433158222, + "loss": 2.7188, + "mean_token_accuracy": 0.40758293867111206, + "num_tokens": 5613663288.0, + "step": 10982 + }, + { + "epoch": 2.969983775013521, + "grad_norm": 2.171875, + "learning_rate": 0.008724096147908098, + "loss": 3.0925, + "mean_token_accuracy": 0.4262382686138153, + "num_tokens": 5614187484.0, + "step": 10983 + }, + { + "epoch": 2.9702541914548406, + "grad_norm": 3.828125, + "learning_rate": 0.0087225709324894, + "loss": 3.2671, + "mean_token_accuracy": 0.39267218112945557, + "num_tokens": 5614693867.0, + "step": 10984 + }, + { + "epoch": 2.9705246078961602, + "grad_norm": 2.90625, + "learning_rate": 0.008721045786948918, + "loss": 3.0309, + "mean_token_accuracy": 0.3979267477989197, + "num_tokens": 5615218138.0, + "step": 10985 + }, + { + "epoch": 2.97079502433748, + "grad_norm": 2.515625, + "learning_rate": 0.008719520711333459, + "loss": 2.9096, + "mean_token_accuracy": 0.42783722281455994, + "num_tokens": 5615742259.0, + "step": 10986 + }, + { + "epoch": 2.9710654407787995, + "grad_norm": 2.609375, + "learning_rate": 0.00871799570568981, + "loss": 3.0222, + "mean_token_accuracy": 0.41073092818260193, + "num_tokens": 5616266505.0, + "step": 10987 + }, + { + "epoch": 2.9713358572201187, + "grad_norm": 2.296875, + "learning_rate": 0.008716470770064759, + "loss": 2.8971, + "mean_token_accuracy": 0.42500555515289307, + "num_tokens": 5616790617.0, + "step": 10988 + }, + { + "epoch": 2.971606273661439, + "grad_norm": 2.984375, + "learning_rate": 0.008714945904505104, + "loss": 3.061, + "mean_token_accuracy": 0.4054551124572754, + "num_tokens": 5617314878.0, + "step": 10989 + }, + { + "epoch": 2.971876690102758, + "grad_norm": 3.953125, + "learning_rate": 0.008713421109057628, + "loss": 2.9839, + "mean_token_accuracy": 0.42654067277908325, + "num_tokens": 5617795728.0, + "step": 10990 + }, + { + "epoch": 2.972147106544078, + "grad_norm": 54.75, + "learning_rate": 0.008711896383769114, + "loss": 18.5697, + "mean_token_accuracy": 0.00011365170939825475, + "num_tokens": 5618319836.0, + "step": 10991 + }, + { + "epoch": 2.9724175229853973, + "grad_norm": 6.75, + "learning_rate": 0.008710371728686352, + "loss": 3.2238, + "mean_token_accuracy": 0.43076440691947937, + "num_tokens": 5618750075.0, + "step": 10992 + }, + { + "epoch": 2.9726879394267174, + "grad_norm": 2.046875, + "learning_rate": 0.008708847143856113, + "loss": 3.0168, + "mean_token_accuracy": 0.41257011890411377, + "num_tokens": 5619261501.0, + "step": 10993 + }, + { + "epoch": 2.9729583558680366, + "grad_norm": 2.90625, + "learning_rate": 0.008707322629325187, + "loss": 3.2977, + "mean_token_accuracy": 0.3938932418823242, + "num_tokens": 5619758047.0, + "step": 10994 + }, + { + "epoch": 2.9732287723093567, + "grad_norm": 3.875, + "learning_rate": 0.008705798185140346, + "loss": 3.0648, + "mean_token_accuracy": 0.4070400595664978, + "num_tokens": 5620282278.0, + "step": 10995 + }, + { + "epoch": 2.973499188750676, + "grad_norm": 2.984375, + "learning_rate": 0.008704273811348359, + "loss": 2.8282, + "mean_token_accuracy": 0.4359874427318573, + "num_tokens": 5620806498.0, + "step": 10996 + }, + { + "epoch": 2.9737696051919955, + "grad_norm": 2.890625, + "learning_rate": 0.008702749507996006, + "loss": 2.8984, + "mean_token_accuracy": 0.440166711807251, + "num_tokens": 5621321449.0, + "step": 10997 + }, + { + "epoch": 2.974040021633315, + "grad_norm": 3.71875, + "learning_rate": 0.00870122527513005, + "loss": 3.1046, + "mean_token_accuracy": 0.41367119550704956, + "num_tokens": 5621845682.0, + "step": 10998 + }, + { + "epoch": 2.974310438074635, + "grad_norm": 3.03125, + "learning_rate": 0.008699701112797265, + "loss": 3.1531, + "mean_token_accuracy": 0.4059625267982483, + "num_tokens": 5622361931.0, + "step": 10999 + }, + { + "epoch": 2.9745808545159544, + "grad_norm": 3.265625, + "learning_rate": 0.008698177021044416, + "loss": 3.095, + "mean_token_accuracy": 0.4205361604690552, + "num_tokens": 5622855985.0, + "step": 11000 + }, + { + "epoch": 2.974851270957274, + "grad_norm": 2.78125, + "learning_rate": 0.008696652999918267, + "loss": 3.0048, + "mean_token_accuracy": 0.3993185758590698, + "num_tokens": 5623380146.0, + "step": 11001 + }, + { + "epoch": 2.9751216873985937, + "grad_norm": 2.796875, + "learning_rate": 0.008695129049465572, + "loss": 3.0486, + "mean_token_accuracy": 0.4127769470214844, + "num_tokens": 5623904332.0, + "step": 11002 + }, + { + "epoch": 2.9753921038399134, + "grad_norm": 3.21875, + "learning_rate": 0.008693605169733101, + "loss": 2.9718, + "mean_token_accuracy": 0.4091704189777374, + "num_tokens": 5624428609.0, + "step": 11003 + }, + { + "epoch": 2.975662520281233, + "grad_norm": 3.109375, + "learning_rate": 0.008692081360767602, + "loss": 3.1338, + "mean_token_accuracy": 0.41591858863830566, + "num_tokens": 5624942051.0, + "step": 11004 + }, + { + "epoch": 2.9759329367225527, + "grad_norm": 2.78125, + "learning_rate": 0.008690557622615834, + "loss": 2.9444, + "mean_token_accuracy": 0.403682142496109, + "num_tokens": 5625466231.0, + "step": 11005 + }, + { + "epoch": 2.9762033531638723, + "grad_norm": 3.40625, + "learning_rate": 0.008689033955324553, + "loss": 2.9078, + "mean_token_accuracy": 0.4019094705581665, + "num_tokens": 5625990515.0, + "step": 11006 + }, + { + "epoch": 2.976473769605192, + "grad_norm": 2.8125, + "learning_rate": 0.008687510358940507, + "loss": 2.8178, + "mean_token_accuracy": 0.41375380754470825, + "num_tokens": 5626514635.0, + "step": 11007 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 3.21875, + "learning_rate": 0.008685986833510437, + "loss": 2.9831, + "mean_token_accuracy": 0.4246678948402405, + "num_tokens": 5627038889.0, + "step": 11008 + }, + { + "epoch": 2.9770146024878312, + "grad_norm": 3.625, + "learning_rate": 0.008684463379081105, + "loss": 3.0492, + "mean_token_accuracy": 0.41619300842285156, + "num_tokens": 5627563067.0, + "step": 11009 + }, + { + "epoch": 2.977285018929151, + "grad_norm": 2.90625, + "learning_rate": 0.008682939995699242, + "loss": 3.0536, + "mean_token_accuracy": 0.4004409909248352, + "num_tokens": 5628087220.0, + "step": 11010 + }, + { + "epoch": 2.9775554353704705, + "grad_norm": 4.3125, + "learning_rate": 0.008681416683411592, + "loss": 12.1735, + "mean_token_accuracy": 0.00024043835583142936, + "num_tokens": 5628588264.0, + "step": 11011 + }, + { + "epoch": 2.97782585181179, + "grad_norm": 6.75, + "learning_rate": 0.0086798934422649, + "loss": 3.4385, + "mean_token_accuracy": 0.3688510060310364, + "num_tokens": 5629112536.0, + "step": 11012 + }, + { + "epoch": 2.97809626825311, + "grad_norm": 2.109375, + "learning_rate": 0.008678370272305898, + "loss": 3.0139, + "mean_token_accuracy": 0.41086345911026, + "num_tokens": 5629636805.0, + "step": 11013 + }, + { + "epoch": 2.9783666846944294, + "grad_norm": 2.90625, + "learning_rate": 0.008676847173581321, + "loss": 3.1499, + "mean_token_accuracy": 0.4143100082874298, + "num_tokens": 5630129413.0, + "step": 11014 + }, + { + "epoch": 2.978637101135749, + "grad_norm": 3.421875, + "learning_rate": 0.00867532414613791, + "loss": 3.0704, + "mean_token_accuracy": 0.4052903354167938, + "num_tokens": 5630653545.0, + "step": 11015 + }, + { + "epoch": 2.9789075175770687, + "grad_norm": 3.4375, + "learning_rate": 0.008673801190022385, + "loss": 3.0345, + "mean_token_accuracy": 0.40366601943969727, + "num_tokens": 5631177772.0, + "step": 11016 + }, + { + "epoch": 2.9791779340183884, + "grad_norm": 2.703125, + "learning_rate": 0.008672278305281486, + "loss": 3.1102, + "mean_token_accuracy": 0.4053241014480591, + "num_tokens": 5631692267.0, + "step": 11017 + }, + { + "epoch": 2.979448350459708, + "grad_norm": 4.9375, + "learning_rate": 0.008670755491961932, + "loss": 2.8598, + "mean_token_accuracy": 0.4649341106414795, + "num_tokens": 5632216479.0, + "step": 11018 + }, + { + "epoch": 2.9797187669010277, + "grad_norm": 2.578125, + "learning_rate": 0.008669232750110447, + "loss": 3.0153, + "mean_token_accuracy": 0.40355241298675537, + "num_tokens": 5632740622.0, + "step": 11019 + }, + { + "epoch": 2.9799891833423473, + "grad_norm": 4.0625, + "learning_rate": 0.008667710079773759, + "loss": 3.0658, + "mean_token_accuracy": 0.41664785146713257, + "num_tokens": 5633264809.0, + "step": 11020 + }, + { + "epoch": 2.980259599783667, + "grad_norm": 2.734375, + "learning_rate": 0.008666187480998587, + "loss": 2.8242, + "mean_token_accuracy": 0.4154438376426697, + "num_tokens": 5633788993.0, + "step": 11021 + }, + { + "epoch": 2.9805300162249866, + "grad_norm": 2.65625, + "learning_rate": 0.008664664953831638, + "loss": 2.7733, + "mean_token_accuracy": 0.4452139437198639, + "num_tokens": 5634313160.0, + "step": 11022 + }, + { + "epoch": 2.9808004326663062, + "grad_norm": 2.78125, + "learning_rate": 0.008663142498319643, + "loss": 2.8349, + "mean_token_accuracy": 0.41361165046691895, + "num_tokens": 5634811882.0, + "step": 11023 + }, + { + "epoch": 2.981070849107626, + "grad_norm": 2.21875, + "learning_rate": 0.008661620114509308, + "loss": 3.0613, + "mean_token_accuracy": 0.40863358974456787, + "num_tokens": 5635336151.0, + "step": 11024 + }, + { + "epoch": 2.9813412655489455, + "grad_norm": 3.0, + "learning_rate": 0.00866009780244734, + "loss": 3.0852, + "mean_token_accuracy": 0.399027943611145, + "num_tokens": 5635860334.0, + "step": 11025 + }, + { + "epoch": 2.981611681990265, + "grad_norm": 2.6875, + "learning_rate": 0.008658575562180455, + "loss": 3.0559, + "mean_token_accuracy": 0.4167194068431854, + "num_tokens": 5636384564.0, + "step": 11026 + }, + { + "epoch": 2.981882098431585, + "grad_norm": 2.984375, + "learning_rate": 0.008657053393755358, + "loss": 2.9238, + "mean_token_accuracy": 0.4027509391307831, + "num_tokens": 5636908783.0, + "step": 11027 + }, + { + "epoch": 2.9821525148729044, + "grad_norm": 2.453125, + "learning_rate": 0.00865553129721875, + "loss": 3.0986, + "mean_token_accuracy": 0.4038845896720886, + "num_tokens": 5637419025.0, + "step": 11028 + }, + { + "epoch": 2.9824229313142236, + "grad_norm": 2.65625, + "learning_rate": 0.008654009272617343, + "loss": 2.833, + "mean_token_accuracy": 0.4028637409210205, + "num_tokens": 5637943304.0, + "step": 11029 + }, + { + "epoch": 2.9826933477555437, + "grad_norm": 3.59375, + "learning_rate": 0.008652487319997825, + "loss": 2.6787, + "mean_token_accuracy": 0.41625508666038513, + "num_tokens": 5638467548.0, + "step": 11030 + }, + { + "epoch": 2.982963764196863, + "grad_norm": 191.0, + "learning_rate": 0.008650965439406902, + "loss": 12.8582, + "mean_token_accuracy": 0.00024182838387787342, + "num_tokens": 5638970719.0, + "step": 11031 + }, + { + "epoch": 2.983234180638183, + "grad_norm": 6.78125, + "learning_rate": 0.008649443630891268, + "loss": 3.1752, + "mean_token_accuracy": 0.388606458902359, + "num_tokens": 5639494853.0, + "step": 11032 + }, + { + "epoch": 2.983504597079502, + "grad_norm": 2.09375, + "learning_rate": 0.008647921894497614, + "loss": 3.0373, + "mean_token_accuracy": 0.42350322008132935, + "num_tokens": 5639986002.0, + "step": 11033 + }, + { + "epoch": 2.9837750135208223, + "grad_norm": 3.125, + "learning_rate": 0.008646400230272633, + "loss": 3.2183, + "mean_token_accuracy": 0.3997440040111542, + "num_tokens": 5640468446.0, + "step": 11034 + }, + { + "epoch": 2.9840454299621415, + "grad_norm": 2.828125, + "learning_rate": 0.008644878638263017, + "loss": 2.8858, + "mean_token_accuracy": 0.3960767388343811, + "num_tokens": 5640992643.0, + "step": 11035 + }, + { + "epoch": 2.9843158464034616, + "grad_norm": 3.5, + "learning_rate": 0.00864335711851545, + "loss": 2.8784, + "mean_token_accuracy": 0.45560383796691895, + "num_tokens": 5641516893.0, + "step": 11036 + }, + { + "epoch": 2.984586262844781, + "grad_norm": 2.984375, + "learning_rate": 0.008641835671076611, + "loss": 2.918, + "mean_token_accuracy": 0.4288327097892761, + "num_tokens": 5642041118.0, + "step": 11037 + }, + { + "epoch": 2.9848566792861004, + "grad_norm": 3.390625, + "learning_rate": 0.008640314295993197, + "loss": 3.0931, + "mean_token_accuracy": 0.41180816292762756, + "num_tokens": 5642565266.0, + "step": 11038 + }, + { + "epoch": 2.98512709572742, + "grad_norm": 3.359375, + "learning_rate": 0.008638792993311875, + "loss": 3.1881, + "mean_token_accuracy": 0.40968388319015503, + "num_tokens": 5643077423.0, + "step": 11039 + }, + { + "epoch": 2.9853975121687397, + "grad_norm": 3.203125, + "learning_rate": 0.008637271763079326, + "loss": 3.0757, + "mean_token_accuracy": 0.3822733163833618, + "num_tokens": 5643601682.0, + "step": 11040 + }, + { + "epoch": 2.9856679286100594, + "grad_norm": 3.296875, + "learning_rate": 0.008635750605342234, + "loss": 3.0825, + "mean_token_accuracy": 0.422085165977478, + "num_tokens": 5644083536.0, + "step": 11041 + }, + { + "epoch": 2.985938345051379, + "grad_norm": 3.578125, + "learning_rate": 0.00863422952014726, + "loss": 2.9007, + "mean_token_accuracy": 0.4310454726219177, + "num_tokens": 5644595634.0, + "step": 11042 + }, + { + "epoch": 2.9862087614926986, + "grad_norm": 3.328125, + "learning_rate": 0.008632708507541085, + "loss": 3.074, + "mean_token_accuracy": 0.42419323325157166, + "num_tokens": 5645063829.0, + "step": 11043 + }, + { + "epoch": 2.9864791779340183, + "grad_norm": 2.90625, + "learning_rate": 0.008631187567570376, + "loss": 3.1475, + "mean_token_accuracy": 0.40464621782302856, + "num_tokens": 5645588030.0, + "step": 11044 + }, + { + "epoch": 2.986749594375338, + "grad_norm": 2.546875, + "learning_rate": 0.008629666700281795, + "loss": 2.7731, + "mean_token_accuracy": 0.43802452087402344, + "num_tokens": 5646108657.0, + "step": 11045 + }, + { + "epoch": 2.9870200108166576, + "grad_norm": 4.25, + "learning_rate": 0.008628145905722013, + "loss": 2.6335, + "mean_token_accuracy": 0.45320063829421997, + "num_tokens": 5646632870.0, + "step": 11046 + }, + { + "epoch": 2.987290427257977, + "grad_norm": 1.9296875, + "learning_rate": 0.008626625183937689, + "loss": 2.9716, + "mean_token_accuracy": 0.4060289263725281, + "num_tokens": 5647157018.0, + "step": 11047 + }, + { + "epoch": 2.987560843699297, + "grad_norm": 3.390625, + "learning_rate": 0.00862510453497548, + "loss": 2.8228, + "mean_token_accuracy": 0.4415231943130493, + "num_tokens": 5647622693.0, + "step": 11048 + }, + { + "epoch": 2.9878312601406165, + "grad_norm": 6.28125, + "learning_rate": 0.008623583958882056, + "loss": 2.7005, + "mean_token_accuracy": 0.4816310405731201, + "num_tokens": 5648123421.0, + "step": 11049 + }, + { + "epoch": 2.988101676581936, + "grad_norm": 2.375, + "learning_rate": 0.00862206345570406, + "loss": 2.9951, + "mean_token_accuracy": 0.4082842469215393, + "num_tokens": 5648623037.0, + "step": 11050 + }, + { + "epoch": 2.988372093023256, + "grad_norm": 10.8125, + "learning_rate": 0.00862054302548815, + "loss": 11.022, + "mean_token_accuracy": 2.564944452387863e-06, + "num_tokens": 5649147184.0, + "step": 11051 + }, + { + "epoch": 2.9886425094645754, + "grad_norm": 6.09375, + "learning_rate": 0.00861902266828098, + "loss": 3.4194, + "mean_token_accuracy": 0.35886192321777344, + "num_tokens": 5649671432.0, + "step": 11052 + }, + { + "epoch": 2.988912925905895, + "grad_norm": 2.96875, + "learning_rate": 0.008617502384129195, + "loss": 3.0194, + "mean_token_accuracy": 0.39998024702072144, + "num_tokens": 5650195695.0, + "step": 11053 + }, + { + "epoch": 2.9891833423472147, + "grad_norm": 3.734375, + "learning_rate": 0.008615982173079443, + "loss": 2.9627, + "mean_token_accuracy": 0.3818984925746918, + "num_tokens": 5650719930.0, + "step": 11054 + }, + { + "epoch": 2.9894537587885344, + "grad_norm": 2.890625, + "learning_rate": 0.008614462035178367, + "loss": 3.1845, + "mean_token_accuracy": 0.3944595456123352, + "num_tokens": 5651244115.0, + "step": 11055 + }, + { + "epoch": 2.989724175229854, + "grad_norm": 2.890625, + "learning_rate": 0.008612941970472616, + "loss": 3.1567, + "mean_token_accuracy": 0.41930609941482544, + "num_tokens": 5651716358.0, + "step": 11056 + }, + { + "epoch": 2.9899945916711737, + "grad_norm": 3.640625, + "learning_rate": 0.00861142197900882, + "loss": 3.0239, + "mean_token_accuracy": 0.42030462622642517, + "num_tokens": 5652154088.0, + "step": 11057 + }, + { + "epoch": 2.9902650081124933, + "grad_norm": 3.21875, + "learning_rate": 0.008609902060833625, + "loss": 3.1098, + "mean_token_accuracy": 0.4010021984577179, + "num_tokens": 5652678323.0, + "step": 11058 + }, + { + "epoch": 2.990535424553813, + "grad_norm": 3.484375, + "learning_rate": 0.008608382215993667, + "loss": 2.9611, + "mean_token_accuracy": 0.3968822658061981, + "num_tokens": 5653202604.0, + "step": 11059 + }, + { + "epoch": 2.9908058409951326, + "grad_norm": 2.78125, + "learning_rate": 0.008606862444535568, + "loss": 3.0344, + "mean_token_accuracy": 0.3850410580635071, + "num_tokens": 5653726839.0, + "step": 11060 + }, + { + "epoch": 2.9910762574364522, + "grad_norm": 2.96875, + "learning_rate": 0.00860534274650597, + "loss": 3.12, + "mean_token_accuracy": 0.3859461545944214, + "num_tokens": 5654251004.0, + "step": 11061 + }, + { + "epoch": 2.991346673877772, + "grad_norm": 2.484375, + "learning_rate": 0.008603823121951501, + "loss": 2.8609, + "mean_token_accuracy": 0.4118121564388275, + "num_tokens": 5654775240.0, + "step": 11062 + }, + { + "epoch": 2.9916170903190915, + "grad_norm": 2.8125, + "learning_rate": 0.008602303570918783, + "loss": 2.9714, + "mean_token_accuracy": 0.4146033525466919, + "num_tokens": 5655299278.0, + "step": 11063 + }, + { + "epoch": 2.991887506760411, + "grad_norm": 2.21875, + "learning_rate": 0.008600784093454446, + "loss": 2.8263, + "mean_token_accuracy": 0.42475810647010803, + "num_tokens": 5655823503.0, + "step": 11064 + }, + { + "epoch": 2.992157923201731, + "grad_norm": 2.296875, + "learning_rate": 0.008599264689605104, + "loss": 3.0146, + "mean_token_accuracy": 0.41216179728507996, + "num_tokens": 5656347711.0, + "step": 11065 + }, + { + "epoch": 2.9924283396430504, + "grad_norm": 3.25, + "learning_rate": 0.008597745359417387, + "loss": 3.0519, + "mean_token_accuracy": 0.43521058559417725, + "num_tokens": 5656825826.0, + "step": 11066 + }, + { + "epoch": 2.99269875608437, + "grad_norm": 3.390625, + "learning_rate": 0.008596226102937901, + "loss": 3.1467, + "mean_token_accuracy": 0.39418625831604004, + "num_tokens": 5657350007.0, + "step": 11067 + }, + { + "epoch": 2.9929691725256897, + "grad_norm": 2.84375, + "learning_rate": 0.008594706920213269, + "loss": 3.1129, + "mean_token_accuracy": 0.4075559079647064, + "num_tokens": 5657874240.0, + "step": 11068 + }, + { + "epoch": 2.9932395889670094, + "grad_norm": 3.21875, + "learning_rate": 0.008593187811290105, + "loss": 3.0752, + "mean_token_accuracy": 0.4070532023906708, + "num_tokens": 5658398364.0, + "step": 11069 + }, + { + "epoch": 2.9935100054083286, + "grad_norm": 2.4375, + "learning_rate": 0.008591668776215016, + "loss": 2.9024, + "mean_token_accuracy": 0.41891610622406006, + "num_tokens": 5658922631.0, + "step": 11070 + }, + { + "epoch": 2.9937804218496487, + "grad_norm": 18.25, + "learning_rate": 0.008590149815034608, + "loss": 12.8286, + "mean_token_accuracy": 0.00841442309319973, + "num_tokens": 5659446719.0, + "step": 11071 + }, + { + "epoch": 2.994050838290968, + "grad_norm": 5.71875, + "learning_rate": 0.008588630927795493, + "loss": 3.3916, + "mean_token_accuracy": 0.35143184661865234, + "num_tokens": 5659918350.0, + "step": 11072 + }, + { + "epoch": 2.994321254732288, + "grad_norm": 2.25, + "learning_rate": 0.008587112114544273, + "loss": 3.1713, + "mean_token_accuracy": 0.3892166018486023, + "num_tokens": 5660442626.0, + "step": 11073 + }, + { + "epoch": 2.994591671173607, + "grad_norm": 3.34375, + "learning_rate": 0.008585593375327548, + "loss": 3.1804, + "mean_token_accuracy": 0.41312748193740845, + "num_tokens": 5660966896.0, + "step": 11074 + }, + { + "epoch": 2.9948620876149272, + "grad_norm": 3.125, + "learning_rate": 0.008584074710191918, + "loss": 3.01, + "mean_token_accuracy": 0.41748949885368347, + "num_tokens": 5661491132.0, + "step": 11075 + }, + { + "epoch": 2.9951325040562464, + "grad_norm": 3.375, + "learning_rate": 0.008582556119183984, + "loss": 3.0524, + "mean_token_accuracy": 0.39506658911705017, + "num_tokens": 5662015391.0, + "step": 11076 + }, + { + "epoch": 2.9954029204975665, + "grad_norm": 2.734375, + "learning_rate": 0.008581037602350334, + "loss": 2.9149, + "mean_token_accuracy": 0.4114600419998169, + "num_tokens": 5662522116.0, + "step": 11077 + }, + { + "epoch": 2.9956733369388857, + "grad_norm": 3.8125, + "learning_rate": 0.008579519159737568, + "loss": 3.024, + "mean_token_accuracy": 0.41460102796554565, + "num_tokens": 5663046351.0, + "step": 11078 + }, + { + "epoch": 2.9959437533802054, + "grad_norm": 3.34375, + "learning_rate": 0.008578000791392272, + "loss": 3.2921, + "mean_token_accuracy": 0.401197612285614, + "num_tokens": 5663506533.0, + "step": 11079 + }, + { + "epoch": 2.996214169821525, + "grad_norm": 3.03125, + "learning_rate": 0.008576482497361027, + "loss": 3.0509, + "mean_token_accuracy": 0.3997255563735962, + "num_tokens": 5664030737.0, + "step": 11080 + }, + { + "epoch": 2.9964845862628446, + "grad_norm": 2.71875, + "learning_rate": 0.008574964277690434, + "loss": 3.0262, + "mean_token_accuracy": 0.38923558592796326, + "num_tokens": 5664554953.0, + "step": 11081 + }, + { + "epoch": 2.9967550027041643, + "grad_norm": 2.6875, + "learning_rate": 0.008573446132427066, + "loss": 2.9794, + "mean_token_accuracy": 0.41901957988739014, + "num_tokens": 5665079231.0, + "step": 11082 + }, + { + "epoch": 2.997025419145484, + "grad_norm": 3.328125, + "learning_rate": 0.008571928061617502, + "loss": 3.0882, + "mean_token_accuracy": 0.4042600691318512, + "num_tokens": 5665575922.0, + "step": 11083 + }, + { + "epoch": 2.9972958355868036, + "grad_norm": 3.328125, + "learning_rate": 0.008570410065308332, + "loss": 3.0751, + "mean_token_accuracy": 0.36661529541015625, + "num_tokens": 5666100082.0, + "step": 11084 + }, + { + "epoch": 2.997566252028123, + "grad_norm": 4.09375, + "learning_rate": 0.008568892143546123, + "loss": 3.0499, + "mean_token_accuracy": 0.4379462003707886, + "num_tokens": 5666561802.0, + "step": 11085 + }, + { + "epoch": 2.997836668469443, + "grad_norm": 2.703125, + "learning_rate": 0.008567374296377455, + "loss": 3.1707, + "mean_token_accuracy": 0.39141690731048584, + "num_tokens": 5667085880.0, + "step": 11086 + }, + { + "epoch": 2.9981070849107625, + "grad_norm": 2.890625, + "learning_rate": 0.008565856523848897, + "loss": 2.9371, + "mean_token_accuracy": 0.4103613495826721, + "num_tokens": 5667610104.0, + "step": 11087 + }, + { + "epoch": 2.998377501352082, + "grad_norm": 2.75, + "learning_rate": 0.008564338826007016, + "loss": 3.144, + "mean_token_accuracy": 0.38345837593078613, + "num_tokens": 5668134228.0, + "step": 11088 + }, + { + "epoch": 2.998647917793402, + "grad_norm": 3.0, + "learning_rate": 0.008562821202898387, + "loss": 3.0529, + "mean_token_accuracy": 0.4123535454273224, + "num_tokens": 5668658496.0, + "step": 11089 + }, + { + "epoch": 2.9989183342347214, + "grad_norm": 3.796875, + "learning_rate": 0.008561303654569568, + "loss": 2.9134, + "mean_token_accuracy": 0.4253634810447693, + "num_tokens": 5669158359.0, + "step": 11090 + }, + { + "epoch": 2.999188750676041, + "grad_norm": 41.75, + "learning_rate": 0.008559786181067125, + "loss": 13.8284, + "mean_token_accuracy": 0.0, + "num_tokens": 5669682571.0, + "step": 11091 + }, + { + "epoch": 2.9994591671173607, + "grad_norm": 7.09375, + "learning_rate": 0.008558268782437619, + "loss": 3.465, + "mean_token_accuracy": 0.3902944028377533, + "num_tokens": 5670206812.0, + "step": 11092 + }, + { + "epoch": 2.9997295835586804, + "grad_norm": 2.671875, + "learning_rate": 0.008556751458727608, + "loss": 2.8498, + "mean_token_accuracy": 0.4274935722351074, + "num_tokens": 5670711300.0, + "step": 11093 + }, + { + "epoch": 3.0, + "grad_norm": 3.09375, + "learning_rate": 0.008555234209983642, + "loss": 2.9291, + "mean_token_accuracy": 0.42980900406837463, + "num_tokens": 5670973437.0, + "step": 11094 + }, + { + "epoch": 3.0002704164413196, + "grad_norm": 3.921875, + "learning_rate": 0.008553717036252287, + "loss": 3.058, + "mean_token_accuracy": 0.40464064478874207, + "num_tokens": 5671459759.0, + "step": 11095 + }, + { + "epoch": 3.0005408328826393, + "grad_norm": 3.109375, + "learning_rate": 0.008552199937580085, + "loss": 3.0008, + "mean_token_accuracy": 0.42620593309402466, + "num_tokens": 5671926205.0, + "step": 11096 + }, + { + "epoch": 3.000811249323959, + "grad_norm": 3.1875, + "learning_rate": 0.008550682914013586, + "loss": 2.9684, + "mean_token_accuracy": 0.41079217195510864, + "num_tokens": 5672450349.0, + "step": 11097 + }, + { + "epoch": 3.0010816657652786, + "grad_norm": 3.140625, + "learning_rate": 0.00854916596559934, + "loss": 3.049, + "mean_token_accuracy": 0.4048665761947632, + "num_tokens": 5672937963.0, + "step": 11098 + }, + { + "epoch": 3.001352082206598, + "grad_norm": 3.109375, + "learning_rate": 0.00854764909238389, + "loss": 3.1206, + "mean_token_accuracy": 0.4017493724822998, + "num_tokens": 5673462158.0, + "step": 11099 + }, + { + "epoch": 3.001622498647918, + "grad_norm": 3.65625, + "learning_rate": 0.008546132294413776, + "loss": 2.9704, + "mean_token_accuracy": 0.4134567975997925, + "num_tokens": 5673986400.0, + "step": 11100 + }, + { + "epoch": 3.0018929150892375, + "grad_norm": 2.859375, + "learning_rate": 0.00854461557173554, + "loss": 3.1313, + "mean_token_accuracy": 0.42808204889297485, + "num_tokens": 5674447371.0, + "step": 11101 + }, + { + "epoch": 3.002163331530557, + "grad_norm": 2.84375, + "learning_rate": 0.008543098924395719, + "loss": 2.9864, + "mean_token_accuracy": 0.41539233922958374, + "num_tokens": 5674971473.0, + "step": 11102 + }, + { + "epoch": 3.002433747971877, + "grad_norm": 2.671875, + "learning_rate": 0.008541582352440848, + "loss": 3.0268, + "mean_token_accuracy": 0.4088439345359802, + "num_tokens": 5675495661.0, + "step": 11103 + }, + { + "epoch": 3.0027041644131964, + "grad_norm": 2.953125, + "learning_rate": 0.00854006585591746, + "loss": 3.0434, + "mean_token_accuracy": 0.4029124081134796, + "num_tokens": 5676019915.0, + "step": 11104 + }, + { + "epoch": 3.002974580854516, + "grad_norm": 2.9375, + "learning_rate": 0.008538549434872087, + "loss": 2.9423, + "mean_token_accuracy": 0.4041547179222107, + "num_tokens": 5676544063.0, + "step": 11105 + }, + { + "epoch": 3.0032449972958357, + "grad_norm": 2.859375, + "learning_rate": 0.008537033089351257, + "loss": 2.752, + "mean_token_accuracy": 0.42578238248825073, + "num_tokens": 5677068214.0, + "step": 11106 + }, + { + "epoch": 3.0035154137371554, + "grad_norm": 2.671875, + "learning_rate": 0.008535516819401493, + "loss": 3.0081, + "mean_token_accuracy": 0.42223697900772095, + "num_tokens": 5677592395.0, + "step": 11107 + }, + { + "epoch": 3.003785830178475, + "grad_norm": 2.9375, + "learning_rate": 0.008534000625069317, + "loss": 2.9166, + "mean_token_accuracy": 0.4287484586238861, + "num_tokens": 5678056036.0, + "step": 11108 + }, + { + "epoch": 3.0040562466197946, + "grad_norm": 2.984375, + "learning_rate": 0.008532484506401261, + "loss": 3.2124, + "mean_token_accuracy": 0.3793724477291107, + "num_tokens": 5678535753.0, + "step": 11109 + }, + { + "epoch": 3.0043266630611143, + "grad_norm": 3.109375, + "learning_rate": 0.008530968463443832, + "loss": 2.8836, + "mean_token_accuracy": 0.4196332097053528, + "num_tokens": 5679020906.0, + "step": 11110 + }, + { + "epoch": 3.004597079502434, + "grad_norm": 6.75, + "learning_rate": 0.008529452496243553, + "loss": 11.5392, + "mean_token_accuracy": 0.01310124434530735, + "num_tokens": 5679490751.0, + "step": 11111 + }, + { + "epoch": 3.0048674959437536, + "grad_norm": 6.90625, + "learning_rate": 0.008527936604846938, + "loss": 3.118, + "mean_token_accuracy": 0.39987713098526, + "num_tokens": 5680015028.0, + "step": 11112 + }, + { + "epoch": 3.005137912385073, + "grad_norm": 2.015625, + "learning_rate": 0.008526420789300498, + "loss": 2.8532, + "mean_token_accuracy": 0.43738430738449097, + "num_tokens": 5680485875.0, + "step": 11113 + }, + { + "epoch": 3.005408328826393, + "grad_norm": 3.21875, + "learning_rate": 0.008524905049650738, + "loss": 3.1792, + "mean_token_accuracy": 0.3987102508544922, + "num_tokens": 5681010074.0, + "step": 11114 + }, + { + "epoch": 3.005678745267712, + "grad_norm": 3.21875, + "learning_rate": 0.008523389385944173, + "loss": 2.8131, + "mean_token_accuracy": 0.41043820977211, + "num_tokens": 5681534283.0, + "step": 11115 + }, + { + "epoch": 3.0059491617090317, + "grad_norm": 2.171875, + "learning_rate": 0.008521873798227306, + "loss": 3.0908, + "mean_token_accuracy": 0.40353143215179443, + "num_tokens": 5682058542.0, + "step": 11116 + }, + { + "epoch": 3.0062195781503513, + "grad_norm": 2.328125, + "learning_rate": 0.008520358286546638, + "loss": 2.8818, + "mean_token_accuracy": 0.4270334839820862, + "num_tokens": 5682522685.0, + "step": 11117 + }, + { + "epoch": 3.006489994591671, + "grad_norm": 2.734375, + "learning_rate": 0.008518842850948668, + "loss": 2.9494, + "mean_token_accuracy": 0.39263471961021423, + "num_tokens": 5683046899.0, + "step": 11118 + }, + { + "epoch": 3.0067604110329906, + "grad_norm": 2.71875, + "learning_rate": 0.0085173274914799, + "loss": 2.7431, + "mean_token_accuracy": 0.4364607632160187, + "num_tokens": 5683534113.0, + "step": 11119 + }, + { + "epoch": 3.0070308274743103, + "grad_norm": 2.953125, + "learning_rate": 0.00851581220818682, + "loss": 3.1303, + "mean_token_accuracy": 0.4107791781425476, + "num_tokens": 5684011113.0, + "step": 11120 + }, + { + "epoch": 3.00730124391563, + "grad_norm": 3.453125, + "learning_rate": 0.00851429700111593, + "loss": 3.1648, + "mean_token_accuracy": 0.42171210050582886, + "num_tokens": 5684440593.0, + "step": 11121 + }, + { + "epoch": 3.0075716603569496, + "grad_norm": 3.234375, + "learning_rate": 0.008512781870313721, + "loss": 3.0955, + "mean_token_accuracy": 0.4175834059715271, + "num_tokens": 5684867939.0, + "step": 11122 + }, + { + "epoch": 3.007842076798269, + "grad_norm": 3.09375, + "learning_rate": 0.008511266815826675, + "loss": 2.9895, + "mean_token_accuracy": 0.41237562894821167, + "num_tokens": 5685392112.0, + "step": 11123 + }, + { + "epoch": 3.008112493239589, + "grad_norm": 3.171875, + "learning_rate": 0.008509751837701283, + "loss": 3.074, + "mean_token_accuracy": 0.42167365550994873, + "num_tokens": 5685883063.0, + "step": 11124 + }, + { + "epoch": 3.0083829096809085, + "grad_norm": 3.171875, + "learning_rate": 0.008508236935984027, + "loss": 3.1346, + "mean_token_accuracy": 0.4106890559196472, + "num_tokens": 5686407343.0, + "step": 11125 + }, + { + "epoch": 3.008653326122228, + "grad_norm": 3.34375, + "learning_rate": 0.008506722110721388, + "loss": 3.1347, + "mean_token_accuracy": 0.4013119637966156, + "num_tokens": 5686931462.0, + "step": 11126 + }, + { + "epoch": 3.0089237425635478, + "grad_norm": 2.6875, + "learning_rate": 0.00850520736195985, + "loss": 2.912, + "mean_token_accuracy": 0.42982107400894165, + "num_tokens": 5687455692.0, + "step": 11127 + }, + { + "epoch": 3.0091941590048674, + "grad_norm": 2.84375, + "learning_rate": 0.008503692689745887, + "loss": 3.0815, + "mean_token_accuracy": 0.4148745536804199, + "num_tokens": 5687934707.0, + "step": 11128 + }, + { + "epoch": 3.009464575446187, + "grad_norm": 2.34375, + "learning_rate": 0.008502178094125972, + "loss": 3.006, + "mean_token_accuracy": 0.44198232889175415, + "num_tokens": 5688362309.0, + "step": 11129 + }, + { + "epoch": 3.0097349918875067, + "grad_norm": 2.859375, + "learning_rate": 0.008500663575146578, + "loss": 2.9928, + "mean_token_accuracy": 0.426520973443985, + "num_tokens": 5688857803.0, + "step": 11130 + }, + { + "epoch": 3.0100054083288263, + "grad_norm": 5.3125, + "learning_rate": 0.008499149132854177, + "loss": 11.1458, + "mean_token_accuracy": 0.0, + "num_tokens": 5689345947.0, + "step": 11131 + }, + { + "epoch": 3.010275824770146, + "grad_norm": 6.21875, + "learning_rate": 0.008497634767295235, + "loss": 3.1901, + "mean_token_accuracy": 0.39176779985427856, + "num_tokens": 5689870137.0, + "step": 11132 + }, + { + "epoch": 3.0105462412114656, + "grad_norm": 2.453125, + "learning_rate": 0.008496120478516216, + "loss": 2.8385, + "mean_token_accuracy": 0.4133484363555908, + "num_tokens": 5690305259.0, + "step": 11133 + }, + { + "epoch": 3.0108166576527853, + "grad_norm": 3.09375, + "learning_rate": 0.008494606266563585, + "loss": 2.8034, + "mean_token_accuracy": 0.4106109142303467, + "num_tokens": 5690824620.0, + "step": 11134 + }, + { + "epoch": 3.011087074094105, + "grad_norm": 3.28125, + "learning_rate": 0.008493092131483802, + "loss": 2.9334, + "mean_token_accuracy": 0.42242616415023804, + "num_tokens": 5691348870.0, + "step": 11135 + }, + { + "epoch": 3.0113574905354246, + "grad_norm": 3.21875, + "learning_rate": 0.008491578073323324, + "loss": 3.0077, + "mean_token_accuracy": 0.4098808169364929, + "num_tokens": 5691873001.0, + "step": 11136 + }, + { + "epoch": 3.011627906976744, + "grad_norm": 2.8125, + "learning_rate": 0.008490064092128609, + "loss": 2.7747, + "mean_token_accuracy": 0.409695565700531, + "num_tokens": 5692397191.0, + "step": 11137 + }, + { + "epoch": 3.011898323418064, + "grad_norm": 2.65625, + "learning_rate": 0.00848855018794611, + "loss": 3.1664, + "mean_token_accuracy": 0.4031592607498169, + "num_tokens": 5692921469.0, + "step": 11138 + }, + { + "epoch": 3.0121687398593835, + "grad_norm": 3.75, + "learning_rate": 0.008487036360822273, + "loss": 3.0238, + "mean_token_accuracy": 0.3982795476913452, + "num_tokens": 5693445658.0, + "step": 11139 + }, + { + "epoch": 3.012439156300703, + "grad_norm": 3.03125, + "learning_rate": 0.008485522610803549, + "loss": 2.9289, + "mean_token_accuracy": 0.4184962809085846, + "num_tokens": 5693931340.0, + "step": 11140 + }, + { + "epoch": 3.012709572742023, + "grad_norm": 3.265625, + "learning_rate": 0.008484008937936391, + "loss": 2.9279, + "mean_token_accuracy": 0.4203457832336426, + "num_tokens": 5694455578.0, + "step": 11141 + }, + { + "epoch": 3.0129799891833424, + "grad_norm": 2.75, + "learning_rate": 0.00848249534226724, + "loss": 3.0918, + "mean_token_accuracy": 0.4074564576148987, + "num_tokens": 5694968896.0, + "step": 11142 + }, + { + "epoch": 3.013250405624662, + "grad_norm": 3.125, + "learning_rate": 0.00848098182384253, + "loss": 2.974, + "mean_token_accuracy": 0.3903396427631378, + "num_tokens": 5695493117.0, + "step": 11143 + }, + { + "epoch": 3.0135208220659817, + "grad_norm": 2.453125, + "learning_rate": 0.008479468382708709, + "loss": 2.9259, + "mean_token_accuracy": 0.4176003336906433, + "num_tokens": 5696017364.0, + "step": 11144 + }, + { + "epoch": 3.0137912385073014, + "grad_norm": 3.671875, + "learning_rate": 0.008477955018912207, + "loss": 2.9041, + "mean_token_accuracy": 0.4103400707244873, + "num_tokens": 5696541577.0, + "step": 11145 + }, + { + "epoch": 3.014061654948621, + "grad_norm": 2.625, + "learning_rate": 0.008476441732499463, + "loss": 3.0832, + "mean_token_accuracy": 0.3974708020687103, + "num_tokens": 5697065785.0, + "step": 11146 + }, + { + "epoch": 3.0143320713899406, + "grad_norm": 3.546875, + "learning_rate": 0.008474928523516907, + "loss": 3.0567, + "mean_token_accuracy": 0.4019918739795685, + "num_tokens": 5697589955.0, + "step": 11147 + }, + { + "epoch": 3.0146024878312603, + "grad_norm": 3.140625, + "learning_rate": 0.008473415392010972, + "loss": 3.045, + "mean_token_accuracy": 0.40937429666519165, + "num_tokens": 5698082651.0, + "step": 11148 + }, + { + "epoch": 3.01487290427258, + "grad_norm": 3.21875, + "learning_rate": 0.008471902338028081, + "loss": 2.9975, + "mean_token_accuracy": 0.42942649126052856, + "num_tokens": 5698568621.0, + "step": 11149 + }, + { + "epoch": 3.0151433207138996, + "grad_norm": 2.5625, + "learning_rate": 0.008470389361614664, + "loss": 2.7356, + "mean_token_accuracy": 0.402596116065979, + "num_tokens": 5699042802.0, + "step": 11150 + }, + { + "epoch": 3.015413737155219, + "grad_norm": 112.5, + "learning_rate": 0.008468876462817139, + "loss": 10.9627, + "mean_token_accuracy": 0.01512950286269188, + "num_tokens": 5699566826.0, + "step": 11151 + }, + { + "epoch": 3.015684153596539, + "grad_norm": 10.375, + "learning_rate": 0.008467363641681925, + "loss": 3.2033, + "mean_token_accuracy": 0.3594111204147339, + "num_tokens": 5700074877.0, + "step": 11152 + }, + { + "epoch": 3.0159545700378585, + "grad_norm": 2.96875, + "learning_rate": 0.008465850898255444, + "loss": 3.0934, + "mean_token_accuracy": 0.39431121945381165, + "num_tokens": 5700560309.0, + "step": 11153 + }, + { + "epoch": 3.016224986479178, + "grad_norm": 2.375, + "learning_rate": 0.00846433823258411, + "loss": 2.9248, + "mean_token_accuracy": 0.4142216145992279, + "num_tokens": 5701084538.0, + "step": 11154 + }, + { + "epoch": 3.016495402920498, + "grad_norm": 3.453125, + "learning_rate": 0.008462825644714338, + "loss": 3.2795, + "mean_token_accuracy": 0.39568668603897095, + "num_tokens": 5701608740.0, + "step": 11155 + }, + { + "epoch": 3.016765819361817, + "grad_norm": 3.171875, + "learning_rate": 0.008461313134692535, + "loss": 3.145, + "mean_token_accuracy": 0.40863972902297974, + "num_tokens": 5702086225.0, + "step": 11156 + }, + { + "epoch": 3.0170362358031366, + "grad_norm": 2.59375, + "learning_rate": 0.008459800702565108, + "loss": 3.164, + "mean_token_accuracy": 0.3920223116874695, + "num_tokens": 5702588469.0, + "step": 11157 + }, + { + "epoch": 3.0173066522444563, + "grad_norm": 3.234375, + "learning_rate": 0.008458288348378471, + "loss": 3.1543, + "mean_token_accuracy": 0.41344374418258667, + "num_tokens": 5703070963.0, + "step": 11158 + }, + { + "epoch": 3.017577068685776, + "grad_norm": 3.453125, + "learning_rate": 0.008456776072179022, + "loss": 3.0174, + "mean_token_accuracy": 0.41007187962532043, + "num_tokens": 5703595247.0, + "step": 11159 + }, + { + "epoch": 3.0178474851270956, + "grad_norm": 2.65625, + "learning_rate": 0.008455263874013156, + "loss": 3.0059, + "mean_token_accuracy": 0.40734633803367615, + "num_tokens": 5704119447.0, + "step": 11160 + }, + { + "epoch": 3.018117901568415, + "grad_norm": 2.953125, + "learning_rate": 0.008453751753927284, + "loss": 2.9884, + "mean_token_accuracy": 0.40988612174987793, + "num_tokens": 5704634337.0, + "step": 11161 + }, + { + "epoch": 3.018388318009735, + "grad_norm": 2.46875, + "learning_rate": 0.008452239711967796, + "loss": 2.9896, + "mean_token_accuracy": 0.41567832231521606, + "num_tokens": 5705154019.0, + "step": 11162 + }, + { + "epoch": 3.0186587344510545, + "grad_norm": 3.046875, + "learning_rate": 0.00845072774818108, + "loss": 3.0817, + "mean_token_accuracy": 0.40516233444213867, + "num_tokens": 5705678300.0, + "step": 11163 + }, + { + "epoch": 3.018929150892374, + "grad_norm": 3.171875, + "learning_rate": 0.008449215862613539, + "loss": 3.097, + "mean_token_accuracy": 0.4138658046722412, + "num_tokens": 5706202486.0, + "step": 11164 + }, + { + "epoch": 3.0191995673336938, + "grad_norm": 3.8125, + "learning_rate": 0.008447704055311558, + "loss": 3.0557, + "mean_token_accuracy": 0.42494335770606995, + "num_tokens": 5706721415.0, + "step": 11165 + }, + { + "epoch": 3.0194699837750134, + "grad_norm": 4.3125, + "learning_rate": 0.008446192326321518, + "loss": 3.3103, + "mean_token_accuracy": 0.4123157858848572, + "num_tokens": 5707188428.0, + "step": 11166 + }, + { + "epoch": 3.019740400216333, + "grad_norm": 2.875, + "learning_rate": 0.00844468067568981, + "loss": 2.8451, + "mean_token_accuracy": 0.41345030069351196, + "num_tokens": 5707712679.0, + "step": 11167 + }, + { + "epoch": 3.0200108166576527, + "grad_norm": 3.40625, + "learning_rate": 0.008443169103462813, + "loss": 3.1481, + "mean_token_accuracy": 0.4155346155166626, + "num_tokens": 5708236760.0, + "step": 11168 + }, + { + "epoch": 3.0202812330989723, + "grad_norm": 23.75, + "learning_rate": 0.008441657609686903, + "loss": 2.9745, + "mean_token_accuracy": 0.3961426019668579, + "num_tokens": 5708761043.0, + "step": 11169 + }, + { + "epoch": 3.020551649540292, + "grad_norm": 4.125, + "learning_rate": 0.00844014619440847, + "loss": 3.1072, + "mean_token_accuracy": 0.3967204988002777, + "num_tokens": 5709285263.0, + "step": 11170 + }, + { + "epoch": 3.0208220659816116, + "grad_norm": 262.0, + "learning_rate": 0.008438634857673875, + "loss": 25.562, + "mean_token_accuracy": 0.0, + "num_tokens": 5709809401.0, + "step": 11171 + }, + { + "epoch": 3.0210924824229313, + "grad_norm": 5.09375, + "learning_rate": 0.008437123599529493, + "loss": 2.9998, + "mean_token_accuracy": 0.40339627861976624, + "num_tokens": 5710333616.0, + "step": 11172 + }, + { + "epoch": 3.021362898864251, + "grad_norm": 2.46875, + "learning_rate": 0.008435612420021698, + "loss": 3.2605, + "mean_token_accuracy": 0.39019376039505005, + "num_tokens": 5710857797.0, + "step": 11173 + }, + { + "epoch": 3.0216333153055706, + "grad_norm": 2.84375, + "learning_rate": 0.008434101319196856, + "loss": 3.0626, + "mean_token_accuracy": 0.408540815114975, + "num_tokens": 5711382075.0, + "step": 11174 + }, + { + "epoch": 3.02190373174689, + "grad_norm": 3.15625, + "learning_rate": 0.008432590297101328, + "loss": 2.9239, + "mean_token_accuracy": 0.41500210762023926, + "num_tokens": 5711906143.0, + "step": 11175 + }, + { + "epoch": 3.02217414818821, + "grad_norm": 3.6875, + "learning_rate": 0.008431079353781486, + "loss": 2.8767, + "mean_token_accuracy": 0.4378807544708252, + "num_tokens": 5712368983.0, + "step": 11176 + }, + { + "epoch": 3.0224445646295295, + "grad_norm": 2.90625, + "learning_rate": 0.008429568489283676, + "loss": 3.1573, + "mean_token_accuracy": 0.3689318895339966, + "num_tokens": 5712893168.0, + "step": 11177 + }, + { + "epoch": 3.022714981070849, + "grad_norm": 4.75, + "learning_rate": 0.008428057703654273, + "loss": 2.6574, + "mean_token_accuracy": 0.4525989890098572, + "num_tokens": 5713417432.0, + "step": 11178 + }, + { + "epoch": 3.0229853975121688, + "grad_norm": 2.40625, + "learning_rate": 0.00842654699693962, + "loss": 3.0113, + "mean_token_accuracy": 0.43377387523651123, + "num_tokens": 5713941592.0, + "step": 11179 + }, + { + "epoch": 3.0232558139534884, + "grad_norm": 3.28125, + "learning_rate": 0.00842503636918607, + "loss": 2.897, + "mean_token_accuracy": 0.40887823700904846, + "num_tokens": 5714465789.0, + "step": 11180 + }, + { + "epoch": 3.023526230394808, + "grad_norm": 2.109375, + "learning_rate": 0.00842352582043998, + "loss": 3.0059, + "mean_token_accuracy": 0.4144211709499359, + "num_tokens": 5714990049.0, + "step": 11181 + }, + { + "epoch": 3.0237966468361277, + "grad_norm": 2.828125, + "learning_rate": 0.008422015350747694, + "loss": 3.0433, + "mean_token_accuracy": 0.4174862504005432, + "num_tokens": 5715514331.0, + "step": 11182 + }, + { + "epoch": 3.0240670632774473, + "grad_norm": 2.609375, + "learning_rate": 0.008420504960155555, + "loss": 2.903, + "mean_token_accuracy": 0.4132213294506073, + "num_tokens": 5716038449.0, + "step": 11183 + }, + { + "epoch": 3.024337479718767, + "grad_norm": 2.96875, + "learning_rate": 0.008418994648709913, + "loss": 3.0753, + "mean_token_accuracy": 0.44484585523605347, + "num_tokens": 5716497658.0, + "step": 11184 + }, + { + "epoch": 3.0246078961600866, + "grad_norm": 2.71875, + "learning_rate": 0.008417484416457106, + "loss": 2.9151, + "mean_token_accuracy": 0.43753349781036377, + "num_tokens": 5717007306.0, + "step": 11185 + }, + { + "epoch": 3.0248783126014063, + "grad_norm": 3.8125, + "learning_rate": 0.008415974263443468, + "loss": 2.9901, + "mean_token_accuracy": 0.4457928538322449, + "num_tokens": 5717531467.0, + "step": 11186 + }, + { + "epoch": 3.025148729042726, + "grad_norm": 2.453125, + "learning_rate": 0.008414464189715346, + "loss": 2.9739, + "mean_token_accuracy": 0.4028189182281494, + "num_tokens": 5718055599.0, + "step": 11187 + }, + { + "epoch": 3.0254191454840456, + "grad_norm": 3.203125, + "learning_rate": 0.00841295419531906, + "loss": 3.0253, + "mean_token_accuracy": 0.4223979711532593, + "num_tokens": 5718538567.0, + "step": 11188 + }, + { + "epoch": 3.025689561925365, + "grad_norm": 13.625, + "learning_rate": 0.008411444280300948, + "loss": 2.7308, + "mean_token_accuracy": 0.4435483515262604, + "num_tokens": 5719062623.0, + "step": 11189 + }, + { + "epoch": 3.025959978366685, + "grad_norm": 3.234375, + "learning_rate": 0.00840993444470734, + "loss": 3.1543, + "mean_token_accuracy": 0.4359395503997803, + "num_tokens": 5719522759.0, + "step": 11190 + }, + { + "epoch": 3.0262303948080045, + "grad_norm": 156.0, + "learning_rate": 0.00840842468858456, + "loss": 11.0424, + "mean_token_accuracy": 0.00325066689401865, + "num_tokens": 5720046988.0, + "step": 11191 + }, + { + "epoch": 3.026500811249324, + "grad_norm": 6.5625, + "learning_rate": 0.00840691501197893, + "loss": 3.4239, + "mean_token_accuracy": 0.35452568531036377, + "num_tokens": 5720571231.0, + "step": 11192 + }, + { + "epoch": 3.0267712276906438, + "grad_norm": 2.4375, + "learning_rate": 0.008405405414936774, + "loss": 3.2576, + "mean_token_accuracy": 0.3896874785423279, + "num_tokens": 5721095319.0, + "step": 11193 + }, + { + "epoch": 3.0270416441319634, + "grad_norm": 2.875, + "learning_rate": 0.00840389589750441, + "loss": 3.0183, + "mean_token_accuracy": 0.40818876028060913, + "num_tokens": 5721601319.0, + "step": 11194 + }, + { + "epoch": 3.027312060573283, + "grad_norm": 3.21875, + "learning_rate": 0.008402386459728153, + "loss": 3.044, + "mean_token_accuracy": 0.40798765420913696, + "num_tokens": 5722125551.0, + "step": 11195 + }, + { + "epoch": 3.0275824770146027, + "grad_norm": 2.671875, + "learning_rate": 0.008400877101654321, + "loss": 2.9066, + "mean_token_accuracy": 0.4126192331314087, + "num_tokens": 5722649808.0, + "step": 11196 + }, + { + "epoch": 3.027852893455922, + "grad_norm": 2.921875, + "learning_rate": 0.008399367823329226, + "loss": 2.9692, + "mean_token_accuracy": 0.4200543761253357, + "num_tokens": 5723173951.0, + "step": 11197 + }, + { + "epoch": 3.0281233098972415, + "grad_norm": 3.140625, + "learning_rate": 0.008397858624799167, + "loss": 3.0775, + "mean_token_accuracy": 0.4137888550758362, + "num_tokens": 5723698172.0, + "step": 11198 + }, + { + "epoch": 3.028393726338561, + "grad_norm": 3.171875, + "learning_rate": 0.008396349506110462, + "loss": 2.9566, + "mean_token_accuracy": 0.4293975234031677, + "num_tokens": 5724222361.0, + "step": 11199 + }, + { + "epoch": 3.028664142779881, + "grad_norm": 3.609375, + "learning_rate": 0.008394840467309409, + "loss": 3.1446, + "mean_token_accuracy": 0.396922767162323, + "num_tokens": 5724707387.0, + "step": 11200 + }, + { + "epoch": 3.0289345592212005, + "grad_norm": 3.265625, + "learning_rate": 0.008393331508442313, + "loss": 2.9385, + "mean_token_accuracy": 0.4163859486579895, + "num_tokens": 5725140365.0, + "step": 11201 + }, + { + "epoch": 3.02920497566252, + "grad_norm": 2.609375, + "learning_rate": 0.008391822629555474, + "loss": 2.7538, + "mean_token_accuracy": 0.4253104031085968, + "num_tokens": 5725664460.0, + "step": 11202 + }, + { + "epoch": 3.0294753921038398, + "grad_norm": 3.109375, + "learning_rate": 0.008390313830695183, + "loss": 2.8383, + "mean_token_accuracy": 0.42421233654022217, + "num_tokens": 5726188705.0, + "step": 11203 + }, + { + "epoch": 3.0297458085451594, + "grad_norm": 3.0625, + "learning_rate": 0.008388805111907737, + "loss": 2.9375, + "mean_token_accuracy": 0.42709100246429443, + "num_tokens": 5726712836.0, + "step": 11204 + }, + { + "epoch": 3.030016224986479, + "grad_norm": 2.765625, + "learning_rate": 0.008387296473239434, + "loss": 2.9782, + "mean_token_accuracy": 0.4069947600364685, + "num_tokens": 5727237059.0, + "step": 11205 + }, + { + "epoch": 3.0302866414277987, + "grad_norm": 3.125, + "learning_rate": 0.008385787914736556, + "loss": 2.9783, + "mean_token_accuracy": 0.4143422245979309, + "num_tokens": 5727761283.0, + "step": 11206 + }, + { + "epoch": 3.0305570578691183, + "grad_norm": 3.046875, + "learning_rate": 0.008384279436445395, + "loss": 3.1802, + "mean_token_accuracy": 0.3906019330024719, + "num_tokens": 5728285566.0, + "step": 11207 + }, + { + "epoch": 3.030827474310438, + "grad_norm": 2.6875, + "learning_rate": 0.008382771038412234, + "loss": 2.9113, + "mean_token_accuracy": 0.4127839505672455, + "num_tokens": 5728809752.0, + "step": 11208 + }, + { + "epoch": 3.0310978907517576, + "grad_norm": 8.0, + "learning_rate": 0.008381262720683349, + "loss": 3.0451, + "mean_token_accuracy": 0.42232745885849, + "num_tokens": 5729333860.0, + "step": 11209 + }, + { + "epoch": 3.0313683071930773, + "grad_norm": 2.078125, + "learning_rate": 0.00837975448330503, + "loss": 3.1314, + "mean_token_accuracy": 0.4003037214279175, + "num_tokens": 5729858069.0, + "step": 11210 + }, + { + "epoch": 3.031638723634397, + "grad_norm": 69.5, + "learning_rate": 0.008378246326323547, + "loss": 11.3078, + "mean_token_accuracy": 0.010400082916021347, + "num_tokens": 5730382289.0, + "step": 11211 + }, + { + "epoch": 3.0319091400757165, + "grad_norm": 7.09375, + "learning_rate": 0.008376738249785173, + "loss": 3.3233, + "mean_token_accuracy": 0.3774069547653198, + "num_tokens": 5730906452.0, + "step": 11212 + }, + { + "epoch": 3.032179556517036, + "grad_norm": 2.453125, + "learning_rate": 0.008375230253736191, + "loss": 3.0018, + "mean_token_accuracy": 0.42482614517211914, + "num_tokens": 5731430678.0, + "step": 11213 + }, + { + "epoch": 3.032449972958356, + "grad_norm": 2.828125, + "learning_rate": 0.008373722338222863, + "loss": 3.1035, + "mean_token_accuracy": 0.40882349014282227, + "num_tokens": 5731954857.0, + "step": 11214 + }, + { + "epoch": 3.0327203893996755, + "grad_norm": 3.484375, + "learning_rate": 0.008372214503291451, + "loss": 3.185, + "mean_token_accuracy": 0.39105224609375, + "num_tokens": 5732478987.0, + "step": 11215 + }, + { + "epoch": 3.032990805840995, + "grad_norm": 3.125, + "learning_rate": 0.008370706748988233, + "loss": 2.9127, + "mean_token_accuracy": 0.41921475529670715, + "num_tokens": 5733003202.0, + "step": 11216 + }, + { + "epoch": 3.0332612222823148, + "grad_norm": 2.828125, + "learning_rate": 0.008369199075359459, + "loss": 3.0181, + "mean_token_accuracy": 0.39524123072624207, + "num_tokens": 5733521616.0, + "step": 11217 + }, + { + "epoch": 3.0335316387236344, + "grad_norm": 2.59375, + "learning_rate": 0.008367691482451393, + "loss": 2.9707, + "mean_token_accuracy": 0.4107021689414978, + "num_tokens": 5734045762.0, + "step": 11218 + }, + { + "epoch": 3.033802055164954, + "grad_norm": 2.53125, + "learning_rate": 0.0083661839703103, + "loss": 3.0453, + "mean_token_accuracy": 0.4367109537124634, + "num_tokens": 5734550813.0, + "step": 11219 + }, + { + "epoch": 3.0340724716062737, + "grad_norm": 2.671875, + "learning_rate": 0.008364676538982423, + "loss": 3.0326, + "mean_token_accuracy": 0.41360312700271606, + "num_tokens": 5735075031.0, + "step": 11220 + }, + { + "epoch": 3.0343428880475933, + "grad_norm": 2.703125, + "learning_rate": 0.008363169188514019, + "loss": 3.0771, + "mean_token_accuracy": 0.4173724353313446, + "num_tokens": 5735599295.0, + "step": 11221 + }, + { + "epoch": 3.034613304488913, + "grad_norm": 3.453125, + "learning_rate": 0.008361661918951345, + "loss": 3.1751, + "mean_token_accuracy": 0.39438459277153015, + "num_tokens": 5736123467.0, + "step": 11222 + }, + { + "epoch": 3.0348837209302326, + "grad_norm": 2.953125, + "learning_rate": 0.008360154730340635, + "loss": 3.0645, + "mean_token_accuracy": 0.42712974548339844, + "num_tokens": 5736647082.0, + "step": 11223 + }, + { + "epoch": 3.0351541373715523, + "grad_norm": 3.3125, + "learning_rate": 0.008358647622728145, + "loss": 3.0742, + "mean_token_accuracy": 0.4065890312194824, + "num_tokens": 5737158577.0, + "step": 11224 + }, + { + "epoch": 3.035424553812872, + "grad_norm": 2.515625, + "learning_rate": 0.008357140596160113, + "loss": 3.0141, + "mean_token_accuracy": 0.43940868973731995, + "num_tokens": 5737677438.0, + "step": 11225 + }, + { + "epoch": 3.0356949702541915, + "grad_norm": 2.53125, + "learning_rate": 0.008355633650682776, + "loss": 2.8646, + "mean_token_accuracy": 0.4384685158729553, + "num_tokens": 5738201654.0, + "step": 11226 + }, + { + "epoch": 3.035965386695511, + "grad_norm": 2.859375, + "learning_rate": 0.008354126786342382, + "loss": 3.1037, + "mean_token_accuracy": 0.39802712202072144, + "num_tokens": 5738725863.0, + "step": 11227 + }, + { + "epoch": 3.036235803136831, + "grad_norm": 2.578125, + "learning_rate": 0.008352620003185157, + "loss": 2.7692, + "mean_token_accuracy": 0.42146188020706177, + "num_tokens": 5739250059.0, + "step": 11228 + }, + { + "epoch": 3.0365062195781505, + "grad_norm": 2.578125, + "learning_rate": 0.008351113301257334, + "loss": 3.1388, + "mean_token_accuracy": 0.4180120825767517, + "num_tokens": 5739774222.0, + "step": 11229 + }, + { + "epoch": 3.03677663601947, + "grad_norm": 2.78125, + "learning_rate": 0.008349606680605146, + "loss": 2.8693, + "mean_token_accuracy": 0.43612879514694214, + "num_tokens": 5740276174.0, + "step": 11230 + }, + { + "epoch": 3.0370470524607898, + "grad_norm": 3.25, + "learning_rate": 0.008348100141274819, + "loss": 11.0841, + "mean_token_accuracy": 9.221311302098911e-06, + "num_tokens": 5740800348.0, + "step": 11231 + }, + { + "epoch": 3.0373174689021094, + "grad_norm": 7.0, + "learning_rate": 0.008346593683312578, + "loss": 3.2791, + "mean_token_accuracy": 0.3815528452396393, + "num_tokens": 5741324505.0, + "step": 11232 + }, + { + "epoch": 3.037587885343429, + "grad_norm": 2.34375, + "learning_rate": 0.008345087306764644, + "loss": 2.9405, + "mean_token_accuracy": 0.42034274339675903, + "num_tokens": 5741848750.0, + "step": 11233 + }, + { + "epoch": 3.0378583017847487, + "grad_norm": 2.1875, + "learning_rate": 0.008343581011677243, + "loss": 2.9309, + "mean_token_accuracy": 0.42565542459487915, + "num_tokens": 5742328697.0, + "step": 11234 + }, + { + "epoch": 3.0381287182260683, + "grad_norm": 2.859375, + "learning_rate": 0.008342074798096587, + "loss": 2.8558, + "mean_token_accuracy": 0.43106722831726074, + "num_tokens": 5742816392.0, + "step": 11235 + }, + { + "epoch": 3.038399134667388, + "grad_norm": 3.375, + "learning_rate": 0.008340568666068892, + "loss": 2.9678, + "mean_token_accuracy": 0.39798593521118164, + "num_tokens": 5743340519.0, + "step": 11236 + }, + { + "epoch": 3.0386695511087076, + "grad_norm": 2.34375, + "learning_rate": 0.008339062615640373, + "loss": 2.9919, + "mean_token_accuracy": 0.4129018783569336, + "num_tokens": 5743864654.0, + "step": 11237 + }, + { + "epoch": 3.038939967550027, + "grad_norm": 3.15625, + "learning_rate": 0.008337556646857236, + "loss": 3.0778, + "mean_token_accuracy": 0.41587984561920166, + "num_tokens": 5744388834.0, + "step": 11238 + }, + { + "epoch": 3.0392103839913465, + "grad_norm": 3.125, + "learning_rate": 0.008336050759765692, + "loss": 3.0071, + "mean_token_accuracy": 0.4221867024898529, + "num_tokens": 5744912968.0, + "step": 11239 + }, + { + "epoch": 3.039480800432666, + "grad_norm": 4.6875, + "learning_rate": 0.008334544954411946, + "loss": 3.1272, + "mean_token_accuracy": 0.40468505024909973, + "num_tokens": 5745437003.0, + "step": 11240 + }, + { + "epoch": 3.0397512168739858, + "grad_norm": 3.640625, + "learning_rate": 0.008333039230842196, + "loss": 3.1043, + "mean_token_accuracy": 0.4106244444847107, + "num_tokens": 5745921372.0, + "step": 11241 + }, + { + "epoch": 3.0400216333153054, + "grad_norm": 3.484375, + "learning_rate": 0.008331533589102649, + "loss": 3.1547, + "mean_token_accuracy": 0.4276255965232849, + "num_tokens": 5746432362.0, + "step": 11242 + }, + { + "epoch": 3.040292049756625, + "grad_norm": 3.65625, + "learning_rate": 0.008330028029239498, + "loss": 2.8473, + "mean_token_accuracy": 0.41338175535202026, + "num_tokens": 5746956531.0, + "step": 11243 + }, + { + "epoch": 3.0405624661979447, + "grad_norm": 2.96875, + "learning_rate": 0.008328522551298938, + "loss": 3.0223, + "mean_token_accuracy": 0.4195249676704407, + "num_tokens": 5747480774.0, + "step": 11244 + }, + { + "epoch": 3.0408328826392643, + "grad_norm": 3.515625, + "learning_rate": 0.008327017155327162, + "loss": 3.0674, + "mean_token_accuracy": 0.41242873668670654, + "num_tokens": 5748004988.0, + "step": 11245 + }, + { + "epoch": 3.041103299080584, + "grad_norm": 2.75, + "learning_rate": 0.00832551184137036, + "loss": 3.0722, + "mean_token_accuracy": 0.4214509129524231, + "num_tokens": 5748525425.0, + "step": 11246 + }, + { + "epoch": 3.0413737155219036, + "grad_norm": 3.125, + "learning_rate": 0.00832400660947472, + "loss": 3.0667, + "mean_token_accuracy": 0.40008342266082764, + "num_tokens": 5749049651.0, + "step": 11247 + }, + { + "epoch": 3.0416441319632233, + "grad_norm": 2.984375, + "learning_rate": 0.00832250145968643, + "loss": 3.223, + "mean_token_accuracy": 0.401738703250885, + "num_tokens": 5749532994.0, + "step": 11248 + }, + { + "epoch": 3.041914548404543, + "grad_norm": 2.953125, + "learning_rate": 0.008320996392051664, + "loss": 2.9673, + "mean_token_accuracy": 0.4138105511665344, + "num_tokens": 5750012248.0, + "step": 11249 + }, + { + "epoch": 3.0421849648458625, + "grad_norm": 2.4375, + "learning_rate": 0.008319491406616612, + "loss": 3.1093, + "mean_token_accuracy": 0.4032090902328491, + "num_tokens": 5750536488.0, + "step": 11250 + }, + { + "epoch": 3.042455381287182, + "grad_norm": 2.953125, + "learning_rate": 0.008317986503427447, + "loss": 10.1403, + "mean_token_accuracy": 0.00039706286042928696, + "num_tokens": 5751060752.0, + "step": 11251 + }, + { + "epoch": 3.042725797728502, + "grad_norm": 9.0, + "learning_rate": 0.008316481682530338, + "loss": 3.4958, + "mean_token_accuracy": 0.37341833114624023, + "num_tokens": 5751585010.0, + "step": 11252 + }, + { + "epoch": 3.0429962141698215, + "grad_norm": 2.890625, + "learning_rate": 0.008314976943971467, + "loss": 2.9336, + "mean_token_accuracy": 0.39468270540237427, + "num_tokens": 5752109035.0, + "step": 11253 + }, + { + "epoch": 3.043266630611141, + "grad_norm": 2.9375, + "learning_rate": 0.008313472287796999, + "loss": 3.0507, + "mean_token_accuracy": 0.40563127398490906, + "num_tokens": 5752633280.0, + "step": 11254 + }, + { + "epoch": 3.0435370470524608, + "grad_norm": 3.03125, + "learning_rate": 0.0083119677140531, + "loss": 2.9091, + "mean_token_accuracy": 0.4587528109550476, + "num_tokens": 5753157397.0, + "step": 11255 + }, + { + "epoch": 3.0438074634937804, + "grad_norm": 3.203125, + "learning_rate": 0.008310463222785938, + "loss": 3.1199, + "mean_token_accuracy": 0.403568834066391, + "num_tokens": 5753646593.0, + "step": 11256 + }, + { + "epoch": 3.0440778799351, + "grad_norm": 3.140625, + "learning_rate": 0.008308958814041676, + "loss": 3.0739, + "mean_token_accuracy": 0.41864126920700073, + "num_tokens": 5754138175.0, + "step": 11257 + }, + { + "epoch": 3.0443482963764197, + "grad_norm": 3.484375, + "learning_rate": 0.008307454487866467, + "loss": 2.9423, + "mean_token_accuracy": 0.4059707522392273, + "num_tokens": 5754625742.0, + "step": 11258 + }, + { + "epoch": 3.0446187128177393, + "grad_norm": 2.671875, + "learning_rate": 0.008305950244306476, + "loss": 3.1587, + "mean_token_accuracy": 0.40566015243530273, + "num_tokens": 5755149942.0, + "step": 11259 + }, + { + "epoch": 3.044889129259059, + "grad_norm": 3.0, + "learning_rate": 0.00830444608340785, + "loss": 3.0731, + "mean_token_accuracy": 0.41664475202560425, + "num_tokens": 5755617560.0, + "step": 11260 + }, + { + "epoch": 3.0451595457003786, + "grad_norm": 2.90625, + "learning_rate": 0.008302942005216748, + "loss": 2.7545, + "mean_token_accuracy": 0.43398937582969666, + "num_tokens": 5756109413.0, + "step": 11261 + }, + { + "epoch": 3.0454299621416983, + "grad_norm": 2.546875, + "learning_rate": 0.008301438009779316, + "loss": 2.9344, + "mean_token_accuracy": 0.4080658555030823, + "num_tokens": 5756633574.0, + "step": 11262 + }, + { + "epoch": 3.045700378583018, + "grad_norm": 2.90625, + "learning_rate": 0.008299934097141703, + "loss": 3.0299, + "mean_token_accuracy": 0.4224625825881958, + "num_tokens": 5757157797.0, + "step": 11263 + }, + { + "epoch": 3.0459707950243375, + "grad_norm": 2.515625, + "learning_rate": 0.00829843026735005, + "loss": 2.93, + "mean_token_accuracy": 0.4228495657444, + "num_tokens": 5757673339.0, + "step": 11264 + }, + { + "epoch": 3.046241211465657, + "grad_norm": 2.53125, + "learning_rate": 0.008296926520450505, + "loss": 3.1022, + "mean_token_accuracy": 0.4312714636325836, + "num_tokens": 5758125105.0, + "step": 11265 + }, + { + "epoch": 3.046511627906977, + "grad_norm": 3.078125, + "learning_rate": 0.008295422856489201, + "loss": 2.7273, + "mean_token_accuracy": 0.42604950070381165, + "num_tokens": 5758649284.0, + "step": 11266 + }, + { + "epoch": 3.0467820443482965, + "grad_norm": 2.953125, + "learning_rate": 0.008293919275512276, + "loss": 2.98, + "mean_token_accuracy": 0.42087697982788086, + "num_tokens": 5759173481.0, + "step": 11267 + }, + { + "epoch": 3.047052460789616, + "grad_norm": 3.375, + "learning_rate": 0.008292415777565867, + "loss": 3.2791, + "mean_token_accuracy": 0.3934863805770874, + "num_tokens": 5759697743.0, + "step": 11268 + }, + { + "epoch": 3.0473228772309358, + "grad_norm": 3.015625, + "learning_rate": 0.008290912362696103, + "loss": 2.9988, + "mean_token_accuracy": 0.4591907858848572, + "num_tokens": 5760157177.0, + "step": 11269 + }, + { + "epoch": 3.0475932936722554, + "grad_norm": 3.21875, + "learning_rate": 0.00828940903094912, + "loss": 3.1386, + "mean_token_accuracy": 0.38643670082092285, + "num_tokens": 5760681404.0, + "step": 11270 + }, + { + "epoch": 3.047863710113575, + "grad_norm": 65.5, + "learning_rate": 0.008287905782371038, + "loss": 12.0713, + "mean_token_accuracy": 0.012315822765231133, + "num_tokens": 5761205576.0, + "step": 11271 + }, + { + "epoch": 3.0481341265548947, + "grad_norm": 5.375, + "learning_rate": 0.008286402617007976, + "loss": 3.123, + "mean_token_accuracy": 0.3952412009239197, + "num_tokens": 5761729641.0, + "step": 11272 + }, + { + "epoch": 3.0484045429962143, + "grad_norm": 1.9453125, + "learning_rate": 0.008284899534906067, + "loss": 3.1501, + "mean_token_accuracy": 0.3886764943599701, + "num_tokens": 5762253916.0, + "step": 11273 + }, + { + "epoch": 3.048674959437534, + "grad_norm": 2.84375, + "learning_rate": 0.008283396536111425, + "loss": 3.1341, + "mean_token_accuracy": 0.4261831045150757, + "num_tokens": 5762724310.0, + "step": 11274 + }, + { + "epoch": 3.0489453758788536, + "grad_norm": 3.65625, + "learning_rate": 0.008281893620670163, + "loss": 2.871, + "mean_token_accuracy": 0.4187505841255188, + "num_tokens": 5763190621.0, + "step": 11275 + }, + { + "epoch": 3.0492157923201733, + "grad_norm": 3.609375, + "learning_rate": 0.0082803907886284, + "loss": 2.9911, + "mean_token_accuracy": 0.41371631622314453, + "num_tokens": 5763714851.0, + "step": 11276 + }, + { + "epoch": 3.049486208761493, + "grad_norm": 3.296875, + "learning_rate": 0.008278888040032248, + "loss": 3.1064, + "mean_token_accuracy": 0.3843362033367157, + "num_tokens": 5764239024.0, + "step": 11277 + }, + { + "epoch": 3.0497566252028125, + "grad_norm": 2.5625, + "learning_rate": 0.008277385374927808, + "loss": 2.6875, + "mean_token_accuracy": 0.46001946926116943, + "num_tokens": 5764707713.0, + "step": 11278 + }, + { + "epoch": 3.0500270416441317, + "grad_norm": 2.8125, + "learning_rate": 0.008275882793361198, + "loss": 3.1005, + "mean_token_accuracy": 0.37621814012527466, + "num_tokens": 5765231928.0, + "step": 11279 + }, + { + "epoch": 3.0502974580854514, + "grad_norm": 2.796875, + "learning_rate": 0.00827438029537851, + "loss": 3.074, + "mean_token_accuracy": 0.40555712580680847, + "num_tokens": 5765756141.0, + "step": 11280 + }, + { + "epoch": 3.050567874526771, + "grad_norm": 5.96875, + "learning_rate": 0.008272877881025851, + "loss": 2.9155, + "mean_token_accuracy": 0.4507221579551697, + "num_tokens": 5766280390.0, + "step": 11281 + }, + { + "epoch": 3.0508382909680907, + "grad_norm": 2.390625, + "learning_rate": 0.008271375550349317, + "loss": 2.9169, + "mean_token_accuracy": 0.4294925928115845, + "num_tokens": 5766768963.0, + "step": 11282 + }, + { + "epoch": 3.0511087074094103, + "grad_norm": 3.53125, + "learning_rate": 0.008269873303395011, + "loss": 2.8339, + "mean_token_accuracy": 0.4568747580051422, + "num_tokens": 5767227738.0, + "step": 11283 + }, + { + "epoch": 3.05137912385073, + "grad_norm": 2.46875, + "learning_rate": 0.008268371140209015, + "loss": 2.8068, + "mean_token_accuracy": 0.4225354492664337, + "num_tokens": 5767751924.0, + "step": 11284 + }, + { + "epoch": 3.0516495402920496, + "grad_norm": 3.015625, + "learning_rate": 0.00826686906083743, + "loss": 2.6594, + "mean_token_accuracy": 0.4194777011871338, + "num_tokens": 5768276094.0, + "step": 11285 + }, + { + "epoch": 3.0519199567333692, + "grad_norm": 2.96875, + "learning_rate": 0.008265367065326339, + "loss": 2.8713, + "mean_token_accuracy": 0.41942548751831055, + "num_tokens": 5768800253.0, + "step": 11286 + }, + { + "epoch": 3.052190373174689, + "grad_norm": 10.75, + "learning_rate": 0.008263865153721824, + "loss": 2.5922, + "mean_token_accuracy": 0.49551817774772644, + "num_tokens": 5769270394.0, + "step": 11287 + }, + { + "epoch": 3.0524607896160085, + "grad_norm": 3.28125, + "learning_rate": 0.008262363326069975, + "loss": 3.2036, + "mean_token_accuracy": 0.3956858515739441, + "num_tokens": 5769794662.0, + "step": 11288 + }, + { + "epoch": 3.052731206057328, + "grad_norm": 3.265625, + "learning_rate": 0.008260861582416871, + "loss": 3.1609, + "mean_token_accuracy": 0.4158989489078522, + "num_tokens": 5770229739.0, + "step": 11289 + }, + { + "epoch": 3.053001622498648, + "grad_norm": 2.71875, + "learning_rate": 0.008259359922808589, + "loss": 2.9494, + "mean_token_accuracy": 0.415847510099411, + "num_tokens": 5770706487.0, + "step": 11290 + }, + { + "epoch": 3.0532720389399675, + "grad_norm": 17.25, + "learning_rate": 0.008257858347291204, + "loss": 13.3607, + "mean_token_accuracy": 0.00955368485301733, + "num_tokens": 5771230668.0, + "step": 11291 + }, + { + "epoch": 3.053542455381287, + "grad_norm": 6.15625, + "learning_rate": 0.008256356855910788, + "loss": 3.2006, + "mean_token_accuracy": 0.3889623284339905, + "num_tokens": 5771754887.0, + "step": 11292 + }, + { + "epoch": 3.0538128718226067, + "grad_norm": 3.359375, + "learning_rate": 0.008254855448713414, + "loss": 2.9178, + "mean_token_accuracy": 0.46065592765808105, + "num_tokens": 5772181760.0, + "step": 11293 + }, + { + "epoch": 3.0540832882639264, + "grad_norm": 2.328125, + "learning_rate": 0.00825335412574515, + "loss": 3.033, + "mean_token_accuracy": 0.42211395502090454, + "num_tokens": 5772663997.0, + "step": 11294 + }, + { + "epoch": 3.054353704705246, + "grad_norm": 2.78125, + "learning_rate": 0.008251852887052054, + "loss": 3.0407, + "mean_token_accuracy": 0.4069522023200989, + "num_tokens": 5773188200.0, + "step": 11295 + }, + { + "epoch": 3.0546241211465657, + "grad_norm": 3.21875, + "learning_rate": 0.008250351732680194, + "loss": 3.0987, + "mean_token_accuracy": 0.3877354562282562, + "num_tokens": 5773712469.0, + "step": 11296 + }, + { + "epoch": 3.0548945375878853, + "grad_norm": 3.171875, + "learning_rate": 0.008248850662675632, + "loss": 3.085, + "mean_token_accuracy": 0.39756274223327637, + "num_tokens": 5774236641.0, + "step": 11297 + }, + { + "epoch": 3.055164954029205, + "grad_norm": 3.234375, + "learning_rate": 0.008247349677084417, + "loss": 3.2447, + "mean_token_accuracy": 0.4117880165576935, + "num_tokens": 5774760913.0, + "step": 11298 + }, + { + "epoch": 3.0554353704705246, + "grad_norm": 2.265625, + "learning_rate": 0.008245848775952613, + "loss": 2.8563, + "mean_token_accuracy": 0.4114018380641937, + "num_tokens": 5775228722.0, + "step": 11299 + }, + { + "epoch": 3.0557057869118442, + "grad_norm": 2.71875, + "learning_rate": 0.008244347959326269, + "loss": 2.8984, + "mean_token_accuracy": 0.42382633686065674, + "num_tokens": 5775752938.0, + "step": 11300 + }, + { + "epoch": 3.055976203353164, + "grad_norm": 3.71875, + "learning_rate": 0.008242847227251429, + "loss": 3.194, + "mean_token_accuracy": 0.3765431046485901, + "num_tokens": 5776277206.0, + "step": 11301 + }, + { + "epoch": 3.0562466197944835, + "grad_norm": 3.21875, + "learning_rate": 0.008241346579774149, + "loss": 3.0726, + "mean_token_accuracy": 0.4104827046394348, + "num_tokens": 5776801335.0, + "step": 11302 + }, + { + "epoch": 3.056517036235803, + "grad_norm": 3.265625, + "learning_rate": 0.008239846016940467, + "loss": 2.9297, + "mean_token_accuracy": 0.41877633333206177, + "num_tokens": 5777325514.0, + "step": 11303 + }, + { + "epoch": 3.056787452677123, + "grad_norm": 2.59375, + "learning_rate": 0.008238345538796423, + "loss": 3.0007, + "mean_token_accuracy": 0.42055314779281616, + "num_tokens": 5777812831.0, + "step": 11304 + }, + { + "epoch": 3.0570578691184425, + "grad_norm": 3.390625, + "learning_rate": 0.008236845145388064, + "loss": 2.9354, + "mean_token_accuracy": 0.3873305022716522, + "num_tokens": 5778336989.0, + "step": 11305 + }, + { + "epoch": 3.057328285559762, + "grad_norm": 2.5, + "learning_rate": 0.008235344836761423, + "loss": 2.9177, + "mean_token_accuracy": 0.4274677634239197, + "num_tokens": 5778861266.0, + "step": 11306 + }, + { + "epoch": 3.0575987020010817, + "grad_norm": 4.34375, + "learning_rate": 0.008233844612962525, + "loss": 3.263, + "mean_token_accuracy": 0.39306774735450745, + "num_tokens": 5779385444.0, + "step": 11307 + }, + { + "epoch": 3.0578691184424014, + "grad_norm": 2.515625, + "learning_rate": 0.008232344474037414, + "loss": 2.9189, + "mean_token_accuracy": 0.4244934320449829, + "num_tokens": 5779909722.0, + "step": 11308 + }, + { + "epoch": 3.058139534883721, + "grad_norm": 5.28125, + "learning_rate": 0.008230844420032116, + "loss": 2.928, + "mean_token_accuracy": 0.39704006910324097, + "num_tokens": 5780433845.0, + "step": 11309 + }, + { + "epoch": 3.0584099513250407, + "grad_norm": 2.515625, + "learning_rate": 0.008229344450992649, + "loss": 3.0054, + "mean_token_accuracy": 0.4322473406791687, + "num_tokens": 5780893297.0, + "step": 11310 + }, + { + "epoch": 3.0586803677663603, + "grad_norm": 4.25, + "learning_rate": 0.008227844566965048, + "loss": 10.1915, + "mean_token_accuracy": 0.0025775539688766003, + "num_tokens": 5781399046.0, + "step": 11311 + }, + { + "epoch": 3.05895078420768, + "grad_norm": 9.125, + "learning_rate": 0.008226344767995327, + "loss": 3.2934, + "mean_token_accuracy": 0.3599475622177124, + "num_tokens": 5781923169.0, + "step": 11312 + }, + { + "epoch": 3.0592212006489996, + "grad_norm": 2.5, + "learning_rate": 0.0082248450541295, + "loss": 2.9524, + "mean_token_accuracy": 0.4185011386871338, + "num_tokens": 5782447452.0, + "step": 11313 + }, + { + "epoch": 3.0594916170903192, + "grad_norm": 2.828125, + "learning_rate": 0.008223345425413593, + "loss": 2.9428, + "mean_token_accuracy": 0.40816670656204224, + "num_tokens": 5782971631.0, + "step": 11314 + }, + { + "epoch": 3.059762033531639, + "grad_norm": 3.671875, + "learning_rate": 0.00822184588189361, + "loss": 3.0737, + "mean_token_accuracy": 0.40684184432029724, + "num_tokens": 5783472421.0, + "step": 11315 + }, + { + "epoch": 3.0600324499729585, + "grad_norm": 2.90625, + "learning_rate": 0.00822034642361557, + "loss": 2.9688, + "mean_token_accuracy": 0.42471563816070557, + "num_tokens": 5783996648.0, + "step": 11316 + }, + { + "epoch": 3.060302866414278, + "grad_norm": 3.03125, + "learning_rate": 0.008218847050625476, + "loss": 2.8661, + "mean_token_accuracy": 0.4128414988517761, + "num_tokens": 5784520917.0, + "step": 11317 + }, + { + "epoch": 3.060573282855598, + "grad_norm": 2.28125, + "learning_rate": 0.008217347762969326, + "loss": 2.8894, + "mean_token_accuracy": 0.4185899496078491, + "num_tokens": 5785045088.0, + "step": 11318 + }, + { + "epoch": 3.0608436992969175, + "grad_norm": 3.109375, + "learning_rate": 0.008215848560693137, + "loss": 3.078, + "mean_token_accuracy": 0.4180142879486084, + "num_tokens": 5785569254.0, + "step": 11319 + }, + { + "epoch": 3.0611141157382367, + "grad_norm": 3.609375, + "learning_rate": 0.008214349443842903, + "loss": 3.2687, + "mean_token_accuracy": 0.3916665315628052, + "num_tokens": 5786093537.0, + "step": 11320 + }, + { + "epoch": 3.0613845321795563, + "grad_norm": 2.609375, + "learning_rate": 0.008212850412464614, + "loss": 2.8777, + "mean_token_accuracy": 0.4216341972351074, + "num_tokens": 5786617823.0, + "step": 11321 + }, + { + "epoch": 3.061654948620876, + "grad_norm": 3.109375, + "learning_rate": 0.008211351466604278, + "loss": 2.987, + "mean_token_accuracy": 0.4005717635154724, + "num_tokens": 5787142062.0, + "step": 11322 + }, + { + "epoch": 3.0619253650621956, + "grad_norm": 3.515625, + "learning_rate": 0.008209852606307876, + "loss": 2.9024, + "mean_token_accuracy": 0.43570828437805176, + "num_tokens": 5787666287.0, + "step": 11323 + }, + { + "epoch": 3.0621957815035152, + "grad_norm": 2.109375, + "learning_rate": 0.0082083538316214, + "loss": 2.936, + "mean_token_accuracy": 0.425052672624588, + "num_tokens": 5788190437.0, + "step": 11324 + }, + { + "epoch": 3.062466197944835, + "grad_norm": 3.390625, + "learning_rate": 0.00820685514259084, + "loss": 3.0246, + "mean_token_accuracy": 0.417726993560791, + "num_tokens": 5788680351.0, + "step": 11325 + }, + { + "epoch": 3.0627366143861545, + "grad_norm": 2.671875, + "learning_rate": 0.008205356539262179, + "loss": 2.7739, + "mean_token_accuracy": 0.4361386299133301, + "num_tokens": 5789204480.0, + "step": 11326 + }, + { + "epoch": 3.063007030827474, + "grad_norm": 3.796875, + "learning_rate": 0.008203858021681394, + "loss": 3.212, + "mean_token_accuracy": 0.40349745750427246, + "num_tokens": 5789728734.0, + "step": 11327 + }, + { + "epoch": 3.063277447268794, + "grad_norm": 2.46875, + "learning_rate": 0.008202359589894473, + "loss": 2.8134, + "mean_token_accuracy": 0.4120529592037201, + "num_tokens": 5790252973.0, + "step": 11328 + }, + { + "epoch": 3.0635478637101135, + "grad_norm": 2.8125, + "learning_rate": 0.008200861243947387, + "loss": 2.8802, + "mean_token_accuracy": 0.4211721420288086, + "num_tokens": 5790777146.0, + "step": 11329 + }, + { + "epoch": 3.063818280151433, + "grad_norm": 2.796875, + "learning_rate": 0.008199362983886103, + "loss": 2.8602, + "mean_token_accuracy": 0.4183773398399353, + "num_tokens": 5791301334.0, + "step": 11330 + }, + { + "epoch": 3.0640886965927527, + "grad_norm": 17.875, + "learning_rate": 0.008197864809756603, + "loss": 12.7058, + "mean_token_accuracy": 0.01576024480164051, + "num_tokens": 5791825527.0, + "step": 11331 + }, + { + "epoch": 3.0643591130340724, + "grad_norm": 5.46875, + "learning_rate": 0.008196366721604852, + "loss": 2.9979, + "mean_token_accuracy": 0.407870352268219, + "num_tokens": 5792349722.0, + "step": 11332 + }, + { + "epoch": 3.064629529475392, + "grad_norm": 2.328125, + "learning_rate": 0.008194868719476814, + "loss": 2.9413, + "mean_token_accuracy": 0.41836225986480713, + "num_tokens": 5792873880.0, + "step": 11333 + }, + { + "epoch": 3.0648999459167117, + "grad_norm": 2.515625, + "learning_rate": 0.008193370803418453, + "loss": 3.0472, + "mean_token_accuracy": 0.41526252031326294, + "num_tokens": 5793338942.0, + "step": 11334 + }, + { + "epoch": 3.0651703623580313, + "grad_norm": 2.328125, + "learning_rate": 0.008191872973475733, + "loss": 2.9857, + "mean_token_accuracy": 0.4280940294265747, + "num_tokens": 5793830228.0, + "step": 11335 + }, + { + "epoch": 3.065440778799351, + "grad_norm": 2.140625, + "learning_rate": 0.008190375229694602, + "loss": 2.9443, + "mean_token_accuracy": 0.421832412481308, + "num_tokens": 5794344395.0, + "step": 11336 + }, + { + "epoch": 3.0657111952406706, + "grad_norm": 3.0625, + "learning_rate": 0.008188877572121025, + "loss": 3.0894, + "mean_token_accuracy": 0.39580971002578735, + "num_tokens": 5794868583.0, + "step": 11337 + }, + { + "epoch": 3.0659816116819902, + "grad_norm": 3.9375, + "learning_rate": 0.00818738000080095, + "loss": 2.9234, + "mean_token_accuracy": 0.43163999915122986, + "num_tokens": 5795392837.0, + "step": 11338 + }, + { + "epoch": 3.06625202812331, + "grad_norm": 3.3125, + "learning_rate": 0.008185882515780327, + "loss": 3.1662, + "mean_token_accuracy": 0.40332579612731934, + "num_tokens": 5795893425.0, + "step": 11339 + }, + { + "epoch": 3.0665224445646295, + "grad_norm": 4.0625, + "learning_rate": 0.00818438511710511, + "loss": 2.9541, + "mean_token_accuracy": 0.39651715755462646, + "num_tokens": 5796417521.0, + "step": 11340 + }, + { + "epoch": 3.066792861005949, + "grad_norm": 2.828125, + "learning_rate": 0.008182887804821229, + "loss": 3.1413, + "mean_token_accuracy": 0.417786180973053, + "num_tokens": 5796881846.0, + "step": 11341 + }, + { + "epoch": 3.067063277447269, + "grad_norm": 4.3125, + "learning_rate": 0.008181390578974643, + "loss": 3.063, + "mean_token_accuracy": 0.4090462923049927, + "num_tokens": 5797405888.0, + "step": 11342 + }, + { + "epoch": 3.0673336938885885, + "grad_norm": 4.53125, + "learning_rate": 0.008179893439611282, + "loss": 2.9989, + "mean_token_accuracy": 0.4299042224884033, + "num_tokens": 5797900670.0, + "step": 11343 + }, + { + "epoch": 3.067604110329908, + "grad_norm": 2.609375, + "learning_rate": 0.008178396386777077, + "loss": 3.1314, + "mean_token_accuracy": 0.3961699903011322, + "num_tokens": 5798424945.0, + "step": 11344 + }, + { + "epoch": 3.0678745267712277, + "grad_norm": 3.53125, + "learning_rate": 0.008176899420517974, + "loss": 3.0859, + "mean_token_accuracy": 0.40796712040901184, + "num_tokens": 5798949207.0, + "step": 11345 + }, + { + "epoch": 3.0681449432125474, + "grad_norm": 3.40625, + "learning_rate": 0.008175402540879896, + "loss": 2.6553, + "mean_token_accuracy": 0.46342939138412476, + "num_tokens": 5799473476.0, + "step": 11346 + }, + { + "epoch": 3.068415359653867, + "grad_norm": 1.96875, + "learning_rate": 0.008173905747908777, + "loss": 2.9394, + "mean_token_accuracy": 0.4320164620876312, + "num_tokens": 5799997660.0, + "step": 11347 + }, + { + "epoch": 3.0686857760951867, + "grad_norm": 3.375, + "learning_rate": 0.008172409041650541, + "loss": 3.1804, + "mean_token_accuracy": 0.4030234217643738, + "num_tokens": 5800521848.0, + "step": 11348 + }, + { + "epoch": 3.0689561925365063, + "grad_norm": 2.625, + "learning_rate": 0.008170912422151112, + "loss": 2.72, + "mean_token_accuracy": 0.4069764018058777, + "num_tokens": 5801045941.0, + "step": 11349 + }, + { + "epoch": 3.069226608977826, + "grad_norm": 2.890625, + "learning_rate": 0.008169415889456404, + "loss": 2.9442, + "mean_token_accuracy": 0.40558844804763794, + "num_tokens": 5801566551.0, + "step": 11350 + }, + { + "epoch": 3.0694970254191456, + "grad_norm": 26.375, + "learning_rate": 0.008167919443612346, + "loss": 13.5266, + "mean_token_accuracy": 0.00048339233035221696, + "num_tokens": 5802046396.0, + "step": 11351 + }, + { + "epoch": 3.0697674418604652, + "grad_norm": 6.6875, + "learning_rate": 0.008166423084664847, + "loss": 3.4031, + "mean_token_accuracy": 0.3923024833202362, + "num_tokens": 5802570656.0, + "step": 11352 + }, + { + "epoch": 3.070037858301785, + "grad_norm": 2.515625, + "learning_rate": 0.008164926812659819, + "loss": 3.0222, + "mean_token_accuracy": 0.4111054241657257, + "num_tokens": 5803036239.0, + "step": 11353 + }, + { + "epoch": 3.0703082747431045, + "grad_norm": 2.21875, + "learning_rate": 0.008163430627643176, + "loss": 2.9419, + "mean_token_accuracy": 0.4145514965057373, + "num_tokens": 5803560497.0, + "step": 11354 + }, + { + "epoch": 3.070578691184424, + "grad_norm": 2.953125, + "learning_rate": 0.008161934529660823, + "loss": 3.0338, + "mean_token_accuracy": 0.40930137038230896, + "num_tokens": 5804075762.0, + "step": 11355 + }, + { + "epoch": 3.070849107625744, + "grad_norm": 2.515625, + "learning_rate": 0.008160438518758662, + "loss": 2.9544, + "mean_token_accuracy": 0.42388105392456055, + "num_tokens": 5804596550.0, + "step": 11356 + }, + { + "epoch": 3.0711195240670635, + "grad_norm": 2.703125, + "learning_rate": 0.008158942594982602, + "loss": 3.0135, + "mean_token_accuracy": 0.41696375608444214, + "num_tokens": 5805067887.0, + "step": 11357 + }, + { + "epoch": 3.071389940508383, + "grad_norm": 2.265625, + "learning_rate": 0.008157446758378538, + "loss": 2.8956, + "mean_token_accuracy": 0.4251260757446289, + "num_tokens": 5805592116.0, + "step": 11358 + }, + { + "epoch": 3.0716603569497027, + "grad_norm": 2.5625, + "learning_rate": 0.008155951008992362, + "loss": 3.0741, + "mean_token_accuracy": 0.43119797110557556, + "num_tokens": 5806058718.0, + "step": 11359 + }, + { + "epoch": 3.0719307733910224, + "grad_norm": 3.28125, + "learning_rate": 0.008154455346869975, + "loss": 3.0974, + "mean_token_accuracy": 0.4156627655029297, + "num_tokens": 5806582994.0, + "step": 11360 + }, + { + "epoch": 3.0722011898323416, + "grad_norm": 3.390625, + "learning_rate": 0.008152959772057265, + "loss": 2.93, + "mean_token_accuracy": 0.40641677379608154, + "num_tokens": 5807094865.0, + "step": 11361 + }, + { + "epoch": 3.0724716062736612, + "grad_norm": 3.546875, + "learning_rate": 0.008151464284600125, + "loss": 2.8081, + "mean_token_accuracy": 0.4031834304332733, + "num_tokens": 5807619016.0, + "step": 11362 + }, + { + "epoch": 3.072742022714981, + "grad_norm": 2.84375, + "learning_rate": 0.008149968884544438, + "loss": 3.0498, + "mean_token_accuracy": 0.41076231002807617, + "num_tokens": 5808143296.0, + "step": 11363 + }, + { + "epoch": 3.0730124391563005, + "grad_norm": 3.28125, + "learning_rate": 0.008148473571936083, + "loss": 3.0159, + "mean_token_accuracy": 0.4138414263725281, + "num_tokens": 5808650678.0, + "step": 11364 + }, + { + "epoch": 3.07328285559762, + "grad_norm": 2.875, + "learning_rate": 0.00814697834682095, + "loss": 2.9541, + "mean_token_accuracy": 0.4202507734298706, + "num_tokens": 5809127987.0, + "step": 11365 + }, + { + "epoch": 3.07355327203894, + "grad_norm": 3.171875, + "learning_rate": 0.008145483209244908, + "loss": 3.0508, + "mean_token_accuracy": 0.3923726975917816, + "num_tokens": 5809652114.0, + "step": 11366 + }, + { + "epoch": 3.0738236884802594, + "grad_norm": 3.125, + "learning_rate": 0.008143988159253837, + "loss": 3.1145, + "mean_token_accuracy": 0.4117603302001953, + "num_tokens": 5810176241.0, + "step": 11367 + }, + { + "epoch": 3.074094104921579, + "grad_norm": 4.4375, + "learning_rate": 0.008142493196893608, + "loss": 3.2157, + "mean_token_accuracy": 0.40084028244018555, + "num_tokens": 5810696007.0, + "step": 11368 + }, + { + "epoch": 3.0743645213628987, + "grad_norm": 3.5625, + "learning_rate": 0.008140998322210092, + "loss": 3.0909, + "mean_token_accuracy": 0.42363184690475464, + "num_tokens": 5811220244.0, + "step": 11369 + }, + { + "epoch": 3.0746349378042184, + "grad_norm": 3.46875, + "learning_rate": 0.008139503535249152, + "loss": 3.0632, + "mean_token_accuracy": 0.39412927627563477, + "num_tokens": 5811744443.0, + "step": 11370 + }, + { + "epoch": 3.074905354245538, + "grad_norm": 50.5, + "learning_rate": 0.008138008836056658, + "loss": 13.1165, + "mean_token_accuracy": 0.014062635600566864, + "num_tokens": 5812268656.0, + "step": 11371 + }, + { + "epoch": 3.0751757706868577, + "grad_norm": 5.84375, + "learning_rate": 0.008136514224678471, + "loss": 3.3611, + "mean_token_accuracy": 0.38510799407958984, + "num_tokens": 5812775027.0, + "step": 11372 + }, + { + "epoch": 3.0754461871281773, + "grad_norm": 2.328125, + "learning_rate": 0.008135019701160444, + "loss": 3.037, + "mean_token_accuracy": 0.4182230234146118, + "num_tokens": 5813299289.0, + "step": 11373 + }, + { + "epoch": 3.075716603569497, + "grad_norm": 3.28125, + "learning_rate": 0.00813352526554844, + "loss": 2.9957, + "mean_token_accuracy": 0.43036431074142456, + "num_tokens": 5813763484.0, + "step": 11374 + }, + { + "epoch": 3.0759870200108166, + "grad_norm": 3.625, + "learning_rate": 0.008132030917888314, + "loss": 2.969, + "mean_token_accuracy": 0.41454437375068665, + "num_tokens": 5814287762.0, + "step": 11375 + }, + { + "epoch": 3.0762574364521362, + "grad_norm": 3.34375, + "learning_rate": 0.008130536658225907, + "loss": 3.0249, + "mean_token_accuracy": 0.3997338116168976, + "num_tokens": 5814812000.0, + "step": 11376 + }, + { + "epoch": 3.076527852893456, + "grad_norm": 2.921875, + "learning_rate": 0.00812904248660708, + "loss": 2.7861, + "mean_token_accuracy": 0.425231397151947, + "num_tokens": 5815336134.0, + "step": 11377 + }, + { + "epoch": 3.0767982693347755, + "grad_norm": 2.921875, + "learning_rate": 0.008127548403077672, + "loss": 3.0759, + "mean_token_accuracy": 0.3904237151145935, + "num_tokens": 5815860390.0, + "step": 11378 + }, + { + "epoch": 3.077068685776095, + "grad_norm": 2.78125, + "learning_rate": 0.008126054407683521, + "loss": 3.0307, + "mean_token_accuracy": 0.4135962724685669, + "num_tokens": 5816384517.0, + "step": 11379 + }, + { + "epoch": 3.077339102217415, + "grad_norm": 2.9375, + "learning_rate": 0.008124560500470477, + "loss": 2.8204, + "mean_token_accuracy": 0.43030500411987305, + "num_tokens": 5816846439.0, + "step": 11380 + }, + { + "epoch": 3.0776095186587344, + "grad_norm": 2.15625, + "learning_rate": 0.008123066681484374, + "loss": 2.9638, + "mean_token_accuracy": 0.424465149641037, + "num_tokens": 5817344419.0, + "step": 11381 + }, + { + "epoch": 3.077879935100054, + "grad_norm": 2.703125, + "learning_rate": 0.00812157295077104, + "loss": 2.956, + "mean_token_accuracy": 0.43013328313827515, + "num_tokens": 5817868668.0, + "step": 11382 + }, + { + "epoch": 3.0781503515413737, + "grad_norm": 2.9375, + "learning_rate": 0.008120079308376318, + "loss": 3.1278, + "mean_token_accuracy": 0.4172474145889282, + "num_tokens": 5818392926.0, + "step": 11383 + }, + { + "epoch": 3.0784207679826934, + "grad_norm": 2.859375, + "learning_rate": 0.008118585754346033, + "loss": 3.0346, + "mean_token_accuracy": 0.4109842777252197, + "num_tokens": 5818917151.0, + "step": 11384 + }, + { + "epoch": 3.078691184424013, + "grad_norm": 2.5625, + "learning_rate": 0.008117092288726004, + "loss": 2.9617, + "mean_token_accuracy": 0.4184834063053131, + "num_tokens": 5819441397.0, + "step": 11385 + }, + { + "epoch": 3.0789616008653327, + "grad_norm": 2.453125, + "learning_rate": 0.00811559891156207, + "loss": 2.9871, + "mean_token_accuracy": 0.40626928210258484, + "num_tokens": 5819965566.0, + "step": 11386 + }, + { + "epoch": 3.0792320173066523, + "grad_norm": 3.109375, + "learning_rate": 0.008114105622900037, + "loss": 3.1269, + "mean_token_accuracy": 0.40478628873825073, + "num_tokens": 5820489740.0, + "step": 11387 + }, + { + "epoch": 3.079502433747972, + "grad_norm": 3.046875, + "learning_rate": 0.008112612422785735, + "loss": 3.0156, + "mean_token_accuracy": 0.4176103174686432, + "num_tokens": 5821006502.0, + "step": 11388 + }, + { + "epoch": 3.0797728501892916, + "grad_norm": 2.8125, + "learning_rate": 0.008111119311264972, + "loss": 3.0123, + "mean_token_accuracy": 0.4321499764919281, + "num_tokens": 5821484707.0, + "step": 11389 + }, + { + "epoch": 3.0800432666306112, + "grad_norm": 2.890625, + "learning_rate": 0.008109626288383565, + "loss": 3.0384, + "mean_token_accuracy": 0.4078691899776459, + "num_tokens": 5821961858.0, + "step": 11390 + }, + { + "epoch": 3.080313683071931, + "grad_norm": 28.0, + "learning_rate": 0.008108133354187325, + "loss": 13.5272, + "mean_token_accuracy": 0.0, + "num_tokens": 5822486010.0, + "step": 11391 + }, + { + "epoch": 3.0805840995132505, + "grad_norm": 6.03125, + "learning_rate": 0.00810664050872206, + "loss": 3.1656, + "mean_token_accuracy": 0.41888201236724854, + "num_tokens": 5822954326.0, + "step": 11392 + }, + { + "epoch": 3.08085451595457, + "grad_norm": 2.1875, + "learning_rate": 0.008105147752033566, + "loss": 3.0145, + "mean_token_accuracy": 0.40335512161254883, + "num_tokens": 5823478599.0, + "step": 11393 + }, + { + "epoch": 3.08112493239589, + "grad_norm": 2.65625, + "learning_rate": 0.008103655084167658, + "loss": 3.1418, + "mean_token_accuracy": 0.415173202753067, + "num_tokens": 5823981086.0, + "step": 11394 + }, + { + "epoch": 3.0813953488372094, + "grad_norm": 3.078125, + "learning_rate": 0.008102162505170128, + "loss": 3.0729, + "mean_token_accuracy": 0.4214324951171875, + "num_tokens": 5824469975.0, + "step": 11395 + }, + { + "epoch": 3.081665765278529, + "grad_norm": 3.625, + "learning_rate": 0.008100670015086773, + "loss": 2.9874, + "mean_token_accuracy": 0.45468443632125854, + "num_tokens": 5824929946.0, + "step": 11396 + }, + { + "epoch": 3.0819361817198487, + "grad_norm": 2.921875, + "learning_rate": 0.008099177613963391, + "loss": 2.9737, + "mean_token_accuracy": 0.4233490824699402, + "num_tokens": 5825427743.0, + "step": 11397 + }, + { + "epoch": 3.0822065981611684, + "grad_norm": 3.703125, + "learning_rate": 0.00809768530184577, + "loss": 3.0219, + "mean_token_accuracy": 0.4098733365535736, + "num_tokens": 5825952008.0, + "step": 11398 + }, + { + "epoch": 3.082477014602488, + "grad_norm": 2.78125, + "learning_rate": 0.008096193078779694, + "loss": 3.0528, + "mean_token_accuracy": 0.42973244190216064, + "num_tokens": 5826458861.0, + "step": 11399 + }, + { + "epoch": 3.0827474310438077, + "grad_norm": 3.046875, + "learning_rate": 0.008094700944810962, + "loss": 2.9376, + "mean_token_accuracy": 0.3986654579639435, + "num_tokens": 5826983089.0, + "step": 11400 + }, + { + "epoch": 3.0830178474851273, + "grad_norm": 2.0, + "learning_rate": 0.008093208899985344, + "loss": 3.1878, + "mean_token_accuracy": 0.3899868130683899, + "num_tokens": 5827496035.0, + "step": 11401 + }, + { + "epoch": 3.0832882639264465, + "grad_norm": 3.34375, + "learning_rate": 0.008091716944348626, + "loss": 3.0243, + "mean_token_accuracy": 0.41350433230400085, + "num_tokens": 5828020280.0, + "step": 11402 + }, + { + "epoch": 3.083558680367766, + "grad_norm": 3.125, + "learning_rate": 0.008090225077946584, + "loss": 3.045, + "mean_token_accuracy": 0.4089013934135437, + "num_tokens": 5828544400.0, + "step": 11403 + }, + { + "epoch": 3.083829096809086, + "grad_norm": 3.09375, + "learning_rate": 0.008088733300824998, + "loss": 3.1288, + "mean_token_accuracy": 0.42196810245513916, + "num_tokens": 5829027049.0, + "step": 11404 + }, + { + "epoch": 3.0840995132504054, + "grad_norm": 2.515625, + "learning_rate": 0.00808724161302963, + "loss": 2.8069, + "mean_token_accuracy": 0.4359102249145508, + "num_tokens": 5829551258.0, + "step": 11405 + }, + { + "epoch": 3.084369929691725, + "grad_norm": 2.921875, + "learning_rate": 0.00808575001460626, + "loss": 2.9504, + "mean_token_accuracy": 0.4261924624443054, + "num_tokens": 5830075356.0, + "step": 11406 + }, + { + "epoch": 3.0846403461330447, + "grad_norm": 2.890625, + "learning_rate": 0.008084258505600649, + "loss": 3.0696, + "mean_token_accuracy": 0.4119208753108978, + "num_tokens": 5830599532.0, + "step": 11407 + }, + { + "epoch": 3.0849107625743644, + "grad_norm": 2.71875, + "learning_rate": 0.008082767086058557, + "loss": 2.9935, + "mean_token_accuracy": 0.42667514085769653, + "num_tokens": 5831123772.0, + "step": 11408 + }, + { + "epoch": 3.085181179015684, + "grad_norm": 3.46875, + "learning_rate": 0.008081275756025751, + "loss": 3.0715, + "mean_token_accuracy": 0.4157429039478302, + "num_tokens": 5831610065.0, + "step": 11409 + }, + { + "epoch": 3.0854515954570036, + "grad_norm": 3.421875, + "learning_rate": 0.00807978451554799, + "loss": 3.17, + "mean_token_accuracy": 0.4031027555465698, + "num_tokens": 5832134286.0, + "step": 11410 + }, + { + "epoch": 3.0857220118983233, + "grad_norm": 90.5, + "learning_rate": 0.00807829336467103, + "loss": 11.9571, + "mean_token_accuracy": 0.008682711981236935, + "num_tokens": 5832658492.0, + "step": 11411 + }, + { + "epoch": 3.085992428339643, + "grad_norm": 7.625, + "learning_rate": 0.00807680230344062, + "loss": 3.2657, + "mean_token_accuracy": 0.39526402950286865, + "num_tokens": 5833153552.0, + "step": 11412 + }, + { + "epoch": 3.0862628447809626, + "grad_norm": 1.90625, + "learning_rate": 0.00807531133190251, + "loss": 3.0246, + "mean_token_accuracy": 0.4003792405128479, + "num_tokens": 5833677813.0, + "step": 11413 + }, + { + "epoch": 3.086533261222282, + "grad_norm": 2.734375, + "learning_rate": 0.008073820450102454, + "loss": 2.8382, + "mean_token_accuracy": 0.41072481870651245, + "num_tokens": 5834202092.0, + "step": 11414 + }, + { + "epoch": 3.086803677663602, + "grad_norm": 3.046875, + "learning_rate": 0.008072329658086188, + "loss": 3.1289, + "mean_token_accuracy": 0.38610467314720154, + "num_tokens": 5834726374.0, + "step": 11415 + }, + { + "epoch": 3.0870740941049215, + "grad_norm": 2.859375, + "learning_rate": 0.00807083895589946, + "loss": 3.1342, + "mean_token_accuracy": 0.4098179042339325, + "num_tokens": 5835239867.0, + "step": 11416 + }, + { + "epoch": 3.087344510546241, + "grad_norm": 2.90625, + "learning_rate": 0.008069348343588004, + "loss": 2.8135, + "mean_token_accuracy": 0.444480299949646, + "num_tokens": 5835684767.0, + "step": 11417 + }, + { + "epoch": 3.087614926987561, + "grad_norm": 2.859375, + "learning_rate": 0.008067857821197564, + "loss": 3.2316, + "mean_token_accuracy": 0.40112486481666565, + "num_tokens": 5836204788.0, + "step": 11418 + }, + { + "epoch": 3.0878853434288804, + "grad_norm": 2.421875, + "learning_rate": 0.008066367388773863, + "loss": 2.9988, + "mean_token_accuracy": 0.4069739580154419, + "num_tokens": 5836728994.0, + "step": 11419 + }, + { + "epoch": 3.0881557598702, + "grad_norm": 3.09375, + "learning_rate": 0.008064877046362643, + "loss": 3.0352, + "mean_token_accuracy": 0.4100714921951294, + "num_tokens": 5837243249.0, + "step": 11420 + }, + { + "epoch": 3.0884261763115197, + "grad_norm": 2.71875, + "learning_rate": 0.008063386794009626, + "loss": 3.0043, + "mean_token_accuracy": 0.38268154859542847, + "num_tokens": 5837767460.0, + "step": 11421 + }, + { + "epoch": 3.0886965927528394, + "grad_norm": 2.78125, + "learning_rate": 0.008061896631760535, + "loss": 3.0088, + "mean_token_accuracy": 0.4216473698616028, + "num_tokens": 5838254824.0, + "step": 11422 + }, + { + "epoch": 3.088967009194159, + "grad_norm": 3.28125, + "learning_rate": 0.008060406559661096, + "loss": 3.0203, + "mean_token_accuracy": 0.3987719416618347, + "num_tokens": 5838778902.0, + "step": 11423 + }, + { + "epoch": 3.0892374256354787, + "grad_norm": 3.25, + "learning_rate": 0.008058916577757032, + "loss": 2.828, + "mean_token_accuracy": 0.42913997173309326, + "num_tokens": 5839297409.0, + "step": 11424 + }, + { + "epoch": 3.0895078420767983, + "grad_norm": 3.625, + "learning_rate": 0.00805742668609405, + "loss": 2.7233, + "mean_token_accuracy": 0.4596892297267914, + "num_tokens": 5839821685.0, + "step": 11425 + }, + { + "epoch": 3.089778258518118, + "grad_norm": 2.515625, + "learning_rate": 0.00805593688471788, + "loss": 2.9048, + "mean_token_accuracy": 0.4017146825790405, + "num_tokens": 5840288323.0, + "step": 11426 + }, + { + "epoch": 3.0900486749594376, + "grad_norm": 3.53125, + "learning_rate": 0.00805444717367422, + "loss": 2.7889, + "mean_token_accuracy": 0.4311581254005432, + "num_tokens": 5840788657.0, + "step": 11427 + }, + { + "epoch": 3.0903190914007572, + "grad_norm": 3.34375, + "learning_rate": 0.008052957553008782, + "loss": 2.979, + "mean_token_accuracy": 0.4196600914001465, + "num_tokens": 5841312880.0, + "step": 11428 + }, + { + "epoch": 3.090589507842077, + "grad_norm": 3.859375, + "learning_rate": 0.008051468022767275, + "loss": 3.0338, + "mean_token_accuracy": 0.41821038722991943, + "num_tokens": 5841837049.0, + "step": 11429 + }, + { + "epoch": 3.0908599242833965, + "grad_norm": 2.9375, + "learning_rate": 0.008049978582995396, + "loss": 2.8271, + "mean_token_accuracy": 0.4280686676502228, + "num_tokens": 5842360994.0, + "step": 11430 + }, + { + "epoch": 3.091130340724716, + "grad_norm": 92.5, + "learning_rate": 0.008048489233738852, + "loss": 9.6422, + "mean_token_accuracy": 0.011339166201651096, + "num_tokens": 5842885217.0, + "step": 11431 + }, + { + "epoch": 3.091400757166036, + "grad_norm": 8.6875, + "learning_rate": 0.00804699997504334, + "loss": 3.4634, + "mean_token_accuracy": 0.4047636389732361, + "num_tokens": 5843344788.0, + "step": 11432 + }, + { + "epoch": 3.0916711736073554, + "grad_norm": 4.09375, + "learning_rate": 0.008045510806954547, + "loss": 3.0351, + "mean_token_accuracy": 0.40563443303108215, + "num_tokens": 5843832934.0, + "step": 11433 + }, + { + "epoch": 3.091941590048675, + "grad_norm": 5.15625, + "learning_rate": 0.008044021729518179, + "loss": 3.2656, + "mean_token_accuracy": 0.40400105714797974, + "num_tokens": 5844357164.0, + "step": 11434 + }, + { + "epoch": 3.0922120064899947, + "grad_norm": 3.890625, + "learning_rate": 0.008042532742779916, + "loss": 3.0909, + "mean_token_accuracy": 0.42201200127601624, + "num_tokens": 5844840989.0, + "step": 11435 + }, + { + "epoch": 3.0924824229313144, + "grad_norm": 3.421875, + "learning_rate": 0.008041043846785442, + "loss": 3.0886, + "mean_token_accuracy": 0.4130505323410034, + "num_tokens": 5845365265.0, + "step": 11436 + }, + { + "epoch": 3.092752839372634, + "grad_norm": 3.59375, + "learning_rate": 0.008039555041580448, + "loss": 3.0337, + "mean_token_accuracy": 0.42043203115463257, + "num_tokens": 5845889459.0, + "step": 11437 + }, + { + "epoch": 3.0930232558139537, + "grad_norm": 3.09375, + "learning_rate": 0.00803806632721061, + "loss": 3.0968, + "mean_token_accuracy": 0.4221501350402832, + "num_tokens": 5846344394.0, + "step": 11438 + }, + { + "epoch": 3.0932936722552733, + "grad_norm": 3.078125, + "learning_rate": 0.008036577703721603, + "loss": 3.0107, + "mean_token_accuracy": 0.41054123640060425, + "num_tokens": 5846868578.0, + "step": 11439 + }, + { + "epoch": 3.093564088696593, + "grad_norm": 3.21875, + "learning_rate": 0.008035089171159113, + "loss": 3.0839, + "mean_token_accuracy": 0.41410720348358154, + "num_tokens": 5847392755.0, + "step": 11440 + }, + { + "epoch": 3.0938345051379126, + "grad_norm": 3.171875, + "learning_rate": 0.008033600729568807, + "loss": 3.028, + "mean_token_accuracy": 0.431621253490448, + "num_tokens": 5847909342.0, + "step": 11441 + }, + { + "epoch": 3.0941049215792322, + "grad_norm": 3.265625, + "learning_rate": 0.008032112378996347, + "loss": 3.1292, + "mean_token_accuracy": 0.40904319286346436, + "num_tokens": 5848433621.0, + "step": 11442 + }, + { + "epoch": 3.0943753380205514, + "grad_norm": 2.984375, + "learning_rate": 0.008030624119487415, + "loss": 3.0327, + "mean_token_accuracy": 0.4399263858795166, + "num_tokens": 5848916234.0, + "step": 11443 + }, + { + "epoch": 3.094645754461871, + "grad_norm": 2.890625, + "learning_rate": 0.008029135951087663, + "loss": 2.8313, + "mean_token_accuracy": 0.4141737222671509, + "num_tokens": 5849440389.0, + "step": 11444 + }, + { + "epoch": 3.0949161709031907, + "grad_norm": 3.21875, + "learning_rate": 0.008027647873842756, + "loss": 3.0275, + "mean_token_accuracy": 0.4094047546386719, + "num_tokens": 5849964671.0, + "step": 11445 + }, + { + "epoch": 3.0951865873445104, + "grad_norm": 2.96875, + "learning_rate": 0.008026159887798357, + "loss": 3.0929, + "mean_token_accuracy": 0.38796764612197876, + "num_tokens": 5850488948.0, + "step": 11446 + }, + { + "epoch": 3.09545700378583, + "grad_norm": 3.1875, + "learning_rate": 0.008024671993000118, + "loss": 2.9899, + "mean_token_accuracy": 0.42070862650871277, + "num_tokens": 5851013052.0, + "step": 11447 + }, + { + "epoch": 3.0957274202271496, + "grad_norm": 3.59375, + "learning_rate": 0.008023184189493689, + "loss": 2.9673, + "mean_token_accuracy": 0.4274948835372925, + "num_tokens": 5851537057.0, + "step": 11448 + }, + { + "epoch": 3.0959978366684693, + "grad_norm": 2.703125, + "learning_rate": 0.008021696477324727, + "loss": 3.0136, + "mean_token_accuracy": 0.40718236565589905, + "num_tokens": 5852061246.0, + "step": 11449 + }, + { + "epoch": 3.096268253109789, + "grad_norm": 3.359375, + "learning_rate": 0.008020208856538874, + "loss": 2.9638, + "mean_token_accuracy": 0.42772042751312256, + "num_tokens": 5852573695.0, + "step": 11450 + }, + { + "epoch": 3.0965386695511086, + "grad_norm": 137.0, + "learning_rate": 0.008018721327181774, + "loss": 13.0773, + "mean_token_accuracy": 0.0013385785277932882, + "num_tokens": 5853097906.0, + "step": 11451 + }, + { + "epoch": 3.096809085992428, + "grad_norm": 5.59375, + "learning_rate": 0.008017233889299074, + "loss": 3.0873, + "mean_token_accuracy": 0.4023591876029968, + "num_tokens": 5853591486.0, + "step": 11452 + }, + { + "epoch": 3.097079502433748, + "grad_norm": 1.7890625, + "learning_rate": 0.008015746542936408, + "loss": 3.184, + "mean_token_accuracy": 0.41547155380249023, + "num_tokens": 5854086691.0, + "step": 11453 + }, + { + "epoch": 3.0973499188750675, + "grad_norm": 2.65625, + "learning_rate": 0.008014259288139414, + "loss": 2.9985, + "mean_token_accuracy": 0.4163188934326172, + "num_tokens": 5854610968.0, + "step": 11454 + }, + { + "epoch": 3.097620335316387, + "grad_norm": 2.328125, + "learning_rate": 0.00801277212495373, + "loss": 2.8738, + "mean_token_accuracy": 0.4111778736114502, + "num_tokens": 5855135183.0, + "step": 11455 + }, + { + "epoch": 3.097890751757707, + "grad_norm": 2.65625, + "learning_rate": 0.008011285053424974, + "loss": 2.9676, + "mean_token_accuracy": 0.4217899739742279, + "num_tokens": 5855659184.0, + "step": 11456 + }, + { + "epoch": 3.0981611681990264, + "grad_norm": 2.578125, + "learning_rate": 0.008009798073598789, + "loss": 2.7165, + "mean_token_accuracy": 0.45254039764404297, + "num_tokens": 5856183367.0, + "step": 11457 + }, + { + "epoch": 3.098431584640346, + "grad_norm": 3.0625, + "learning_rate": 0.008008311185520792, + "loss": 2.9569, + "mean_token_accuracy": 0.4131215214729309, + "num_tokens": 5856707614.0, + "step": 11458 + }, + { + "epoch": 3.0987020010816657, + "grad_norm": 3.40625, + "learning_rate": 0.008006824389236602, + "loss": 3.1295, + "mean_token_accuracy": 0.3975864052772522, + "num_tokens": 5857231698.0, + "step": 11459 + }, + { + "epoch": 3.0989724175229854, + "grad_norm": 2.671875, + "learning_rate": 0.008005337684791844, + "loss": 2.8048, + "mean_token_accuracy": 0.4195145070552826, + "num_tokens": 5857755983.0, + "step": 11460 + }, + { + "epoch": 3.099242833964305, + "grad_norm": 2.796875, + "learning_rate": 0.008003851072232133, + "loss": 2.9908, + "mean_token_accuracy": 0.41592901945114136, + "num_tokens": 5858224152.0, + "step": 11461 + }, + { + "epoch": 3.0995132504056246, + "grad_norm": 3.046875, + "learning_rate": 0.00800236455160308, + "loss": 3.0354, + "mean_token_accuracy": 0.4059321880340576, + "num_tokens": 5858748301.0, + "step": 11462 + }, + { + "epoch": 3.0997836668469443, + "grad_norm": 3.15625, + "learning_rate": 0.008000878122950304, + "loss": 2.8874, + "mean_token_accuracy": 0.4242427349090576, + "num_tokens": 5859272470.0, + "step": 11463 + }, + { + "epoch": 3.100054083288264, + "grad_norm": 3.09375, + "learning_rate": 0.007999391786319403, + "loss": 3.0578, + "mean_token_accuracy": 0.40113717317581177, + "num_tokens": 5859796566.0, + "step": 11464 + }, + { + "epoch": 3.1003244997295836, + "grad_norm": 3.84375, + "learning_rate": 0.007997905541755985, + "loss": 2.8081, + "mean_token_accuracy": 0.45419585704803467, + "num_tokens": 5860288959.0, + "step": 11465 + }, + { + "epoch": 3.100594916170903, + "grad_norm": 2.515625, + "learning_rate": 0.007996419389305653, + "loss": 3.2056, + "mean_token_accuracy": 0.38342538475990295, + "num_tokens": 5860813238.0, + "step": 11466 + }, + { + "epoch": 3.100865332612223, + "grad_norm": 3.375, + "learning_rate": 0.007994933329014011, + "loss": 2.8671, + "mean_token_accuracy": 0.4782973527908325, + "num_tokens": 5861211755.0, + "step": 11467 + }, + { + "epoch": 3.1011357490535425, + "grad_norm": 3.515625, + "learning_rate": 0.007993447360926649, + "loss": 2.5545, + "mean_token_accuracy": 0.44226324558258057, + "num_tokens": 5861718350.0, + "step": 11468 + }, + { + "epoch": 3.101406165494862, + "grad_norm": 2.28125, + "learning_rate": 0.007991961485089164, + "loss": 3.0892, + "mean_token_accuracy": 0.40828970074653625, + "num_tokens": 5862242534.0, + "step": 11469 + }, + { + "epoch": 3.101676581936182, + "grad_norm": 3.21875, + "learning_rate": 0.007990475701547149, + "loss": 2.9739, + "mean_token_accuracy": 0.4068874716758728, + "num_tokens": 5862766753.0, + "step": 11470 + }, + { + "epoch": 3.1019469983775014, + "grad_norm": 73.0, + "learning_rate": 0.007988990010346187, + "loss": 22.6045, + "mean_token_accuracy": 0.0002516834647394717, + "num_tokens": 5863290919.0, + "step": 11471 + }, + { + "epoch": 3.102217414818821, + "grad_norm": 6.75, + "learning_rate": 0.007987504411531867, + "loss": 3.3723, + "mean_token_accuracy": 0.36476343870162964, + "num_tokens": 5863763204.0, + "step": 11472 + }, + { + "epoch": 3.1024878312601407, + "grad_norm": 2.53125, + "learning_rate": 0.00798601890514977, + "loss": 3.1386, + "mean_token_accuracy": 0.4029502868652344, + "num_tokens": 5864287433.0, + "step": 11473 + }, + { + "epoch": 3.1027582477014604, + "grad_norm": 2.96875, + "learning_rate": 0.007984533491245475, + "loss": 2.9718, + "mean_token_accuracy": 0.3924541473388672, + "num_tokens": 5864811626.0, + "step": 11474 + }, + { + "epoch": 3.10302866414278, + "grad_norm": 3.078125, + "learning_rate": 0.007983048169864566, + "loss": 3.0461, + "mean_token_accuracy": 0.39809250831604004, + "num_tokens": 5865335849.0, + "step": 11475 + }, + { + "epoch": 3.1032990805840996, + "grad_norm": 2.984375, + "learning_rate": 0.007981562941052612, + "loss": 3.2133, + "mean_token_accuracy": 0.39538276195526123, + "num_tokens": 5865860050.0, + "step": 11476 + }, + { + "epoch": 3.1035694970254193, + "grad_norm": 2.390625, + "learning_rate": 0.007980077804855178, + "loss": 2.8966, + "mean_token_accuracy": 0.42098772525787354, + "num_tokens": 5866384194.0, + "step": 11477 + }, + { + "epoch": 3.103839913466739, + "grad_norm": 2.9375, + "learning_rate": 0.00797859276131784, + "loss": 3.0296, + "mean_token_accuracy": 0.3883473575115204, + "num_tokens": 5866908360.0, + "step": 11478 + }, + { + "epoch": 3.1041103299080586, + "grad_norm": 3.25, + "learning_rate": 0.00797710781048616, + "loss": 3.0202, + "mean_token_accuracy": 0.4115670919418335, + "num_tokens": 5867432498.0, + "step": 11479 + }, + { + "epoch": 3.104380746349378, + "grad_norm": 3.03125, + "learning_rate": 0.007975622952405703, + "loss": 3.0515, + "mean_token_accuracy": 0.41380631923675537, + "num_tokens": 5867956777.0, + "step": 11480 + }, + { + "epoch": 3.104651162790698, + "grad_norm": 4.0625, + "learning_rate": 0.00797413818712203, + "loss": 2.9831, + "mean_token_accuracy": 0.4371962547302246, + "num_tokens": 5868458215.0, + "step": 11481 + }, + { + "epoch": 3.1049215792320175, + "grad_norm": 2.875, + "learning_rate": 0.007972653514680692, + "loss": 2.9078, + "mean_token_accuracy": 0.42888790369033813, + "num_tokens": 5868982341.0, + "step": 11482 + }, + { + "epoch": 3.105191995673337, + "grad_norm": 3.796875, + "learning_rate": 0.00797116893512725, + "loss": 3.2127, + "mean_token_accuracy": 0.380520761013031, + "num_tokens": 5869506613.0, + "step": 11483 + }, + { + "epoch": 3.1054624121146563, + "grad_norm": 2.578125, + "learning_rate": 0.007969684448507253, + "loss": 2.9489, + "mean_token_accuracy": 0.4130815863609314, + "num_tokens": 5870030865.0, + "step": 11484 + }, + { + "epoch": 3.105732828555976, + "grad_norm": 2.875, + "learning_rate": 0.007968200054866245, + "loss": 2.957, + "mean_token_accuracy": 0.41772735118865967, + "num_tokens": 5870555037.0, + "step": 11485 + }, + { + "epoch": 3.1060032449972956, + "grad_norm": 2.375, + "learning_rate": 0.00796671575424978, + "loss": 2.9686, + "mean_token_accuracy": 0.41390514373779297, + "num_tokens": 5871079249.0, + "step": 11486 + }, + { + "epoch": 3.1062736614386153, + "grad_norm": 3.015625, + "learning_rate": 0.007965231546703396, + "loss": 2.9943, + "mean_token_accuracy": 0.4115339517593384, + "num_tokens": 5871603521.0, + "step": 11487 + }, + { + "epoch": 3.106544077879935, + "grad_norm": 2.953125, + "learning_rate": 0.007963747432272628, + "loss": 2.7797, + "mean_token_accuracy": 0.451118141412735, + "num_tokens": 5872067639.0, + "step": 11488 + }, + { + "epoch": 3.1068144943212546, + "grad_norm": 3.328125, + "learning_rate": 0.007962263411003019, + "loss": 2.9811, + "mean_token_accuracy": 0.4099392294883728, + "num_tokens": 5872591716.0, + "step": 11489 + }, + { + "epoch": 3.107084910762574, + "grad_norm": 3.59375, + "learning_rate": 0.007960779482940106, + "loss": 2.8524, + "mean_token_accuracy": 0.42151975631713867, + "num_tokens": 5873013603.0, + "step": 11490 + }, + { + "epoch": 3.107355327203894, + "grad_norm": 44.25, + "learning_rate": 0.00795929564812941, + "loss": 11.1149, + "mean_token_accuracy": 0.009604569524526596, + "num_tokens": 5873537880.0, + "step": 11491 + }, + { + "epoch": 3.1076257436452135, + "grad_norm": 7.75, + "learning_rate": 0.007957811906616472, + "loss": 3.0906, + "mean_token_accuracy": 0.3997266888618469, + "num_tokens": 5874036887.0, + "step": 11492 + }, + { + "epoch": 3.107896160086533, + "grad_norm": 2.0625, + "learning_rate": 0.007956328258446808, + "loss": 3.0711, + "mean_token_accuracy": 0.4093219041824341, + "num_tokens": 5874561142.0, + "step": 11493 + }, + { + "epoch": 3.1081665765278528, + "grad_norm": 3.15625, + "learning_rate": 0.007954844703665944, + "loss": 2.9927, + "mean_token_accuracy": 0.4309298098087311, + "num_tokens": 5875024739.0, + "step": 11494 + }, + { + "epoch": 3.1084369929691724, + "grad_norm": 3.046875, + "learning_rate": 0.0079533612423194, + "loss": 2.9751, + "mean_token_accuracy": 0.4124204218387604, + "num_tokens": 5875541157.0, + "step": 11495 + }, + { + "epoch": 3.108707409410492, + "grad_norm": 2.96875, + "learning_rate": 0.007951877874452694, + "loss": 2.8327, + "mean_token_accuracy": 0.44878554344177246, + "num_tokens": 5876040634.0, + "step": 11496 + }, + { + "epoch": 3.1089778258518117, + "grad_norm": 3.125, + "learning_rate": 0.007950394600111335, + "loss": 2.9916, + "mean_token_accuracy": 0.4177525043487549, + "num_tokens": 5876564859.0, + "step": 11497 + }, + { + "epoch": 3.1092482422931313, + "grad_norm": 3.0, + "learning_rate": 0.007948911419340841, + "loss": 3.0526, + "mean_token_accuracy": 0.39509648084640503, + "num_tokens": 5877089024.0, + "step": 11498 + }, + { + "epoch": 3.109518658734451, + "grad_norm": 3.359375, + "learning_rate": 0.007947428332186719, + "loss": 3.0919, + "mean_token_accuracy": 0.40123674273490906, + "num_tokens": 5877613146.0, + "step": 11499 + }, + { + "epoch": 3.1097890751757706, + "grad_norm": 2.734375, + "learning_rate": 0.007945945338694468, + "loss": 2.8822, + "mean_token_accuracy": 0.4215506911277771, + "num_tokens": 5878113471.0, + "step": 11500 + }, + { + "epoch": 3.1100594916170903, + "grad_norm": 2.46875, + "learning_rate": 0.007944462438909597, + "loss": 2.8201, + "mean_token_accuracy": 0.42902031540870667, + "num_tokens": 5878637679.0, + "step": 11501 + }, + { + "epoch": 3.11032990805841, + "grad_norm": 3.015625, + "learning_rate": 0.007942979632877606, + "loss": 2.9559, + "mean_token_accuracy": 0.40179187059402466, + "num_tokens": 5879161954.0, + "step": 11502 + }, + { + "epoch": 3.1106003244997296, + "grad_norm": 2.875, + "learning_rate": 0.007941496920643987, + "loss": 2.7177, + "mean_token_accuracy": 0.43406814336776733, + "num_tokens": 5879686224.0, + "step": 11503 + }, + { + "epoch": 3.110870740941049, + "grad_norm": 3.203125, + "learning_rate": 0.007940014302254241, + "loss": 2.9078, + "mean_token_accuracy": 0.41155746579170227, + "num_tokens": 5880210454.0, + "step": 11504 + }, + { + "epoch": 3.111141157382369, + "grad_norm": 2.421875, + "learning_rate": 0.00793853177775385, + "loss": 2.8905, + "mean_token_accuracy": 0.4460183382034302, + "num_tokens": 5880694973.0, + "step": 11505 + }, + { + "epoch": 3.1114115738236885, + "grad_norm": 3.0, + "learning_rate": 0.007937049347188313, + "loss": 2.9069, + "mean_token_accuracy": 0.412966251373291, + "num_tokens": 5881219242.0, + "step": 11506 + }, + { + "epoch": 3.111681990265008, + "grad_norm": 3.015625, + "learning_rate": 0.007935567010603106, + "loss": 3.0403, + "mean_token_accuracy": 0.42074644565582275, + "num_tokens": 5881703546.0, + "step": 11507 + }, + { + "epoch": 3.111952406706328, + "grad_norm": 2.8125, + "learning_rate": 0.007934084768043715, + "loss": 2.7905, + "mean_token_accuracy": 0.4225429892539978, + "num_tokens": 5882227809.0, + "step": 11508 + }, + { + "epoch": 3.1122228231476474, + "grad_norm": 2.546875, + "learning_rate": 0.00793260261955562, + "loss": 3.069, + "mean_token_accuracy": 0.40018701553344727, + "num_tokens": 5882751955.0, + "step": 11509 + }, + { + "epoch": 3.112493239588967, + "grad_norm": 3.1875, + "learning_rate": 0.0079311205651843, + "loss": 2.9482, + "mean_token_accuracy": 0.412032812833786, + "num_tokens": 5883276131.0, + "step": 11510 + }, + { + "epoch": 3.1127636560302867, + "grad_norm": 3.328125, + "learning_rate": 0.00792963860497522, + "loss": 12.0141, + "mean_token_accuracy": 0.00019053129653912038, + "num_tokens": 5883788958.0, + "step": 11511 + }, + { + "epoch": 3.1130340724716064, + "grad_norm": 7.0, + "learning_rate": 0.007928156738973863, + "loss": 3.3731, + "mean_token_accuracy": 0.3925752639770508, + "num_tokens": 5884313226.0, + "step": 11512 + }, + { + "epoch": 3.113304488912926, + "grad_norm": 2.65625, + "learning_rate": 0.007926674967225688, + "loss": 3.1491, + "mean_token_accuracy": 0.38828033208847046, + "num_tokens": 5884837414.0, + "step": 11513 + }, + { + "epoch": 3.1135749053542456, + "grad_norm": 3.0, + "learning_rate": 0.007925193289776162, + "loss": 3.0123, + "mean_token_accuracy": 0.4081806540489197, + "num_tokens": 5885361650.0, + "step": 11514 + }, + { + "epoch": 3.1138453217955653, + "grad_norm": 2.703125, + "learning_rate": 0.007923711706670751, + "loss": 2.8616, + "mean_token_accuracy": 0.3980366587638855, + "num_tokens": 5885885857.0, + "step": 11515 + }, + { + "epoch": 3.114115738236885, + "grad_norm": 2.765625, + "learning_rate": 0.007922230217954912, + "loss": 2.8288, + "mean_token_accuracy": 0.43672072887420654, + "num_tokens": 5886374313.0, + "step": 11516 + }, + { + "epoch": 3.1143861546782046, + "grad_norm": 3.125, + "learning_rate": 0.007920748823674098, + "loss": 3.0327, + "mean_token_accuracy": 0.4082765281200409, + "num_tokens": 5886898548.0, + "step": 11517 + }, + { + "epoch": 3.114656571119524, + "grad_norm": 2.65625, + "learning_rate": 0.007919267523873768, + "loss": 2.8932, + "mean_token_accuracy": 0.41010570526123047, + "num_tokens": 5887422709.0, + "step": 11518 + }, + { + "epoch": 3.114926987560844, + "grad_norm": 2.5, + "learning_rate": 0.007917786318599369, + "loss": 2.8705, + "mean_token_accuracy": 0.4203854203224182, + "num_tokens": 5887929438.0, + "step": 11519 + }, + { + "epoch": 3.1151974040021635, + "grad_norm": 2.8125, + "learning_rate": 0.00791630520789635, + "loss": 2.9779, + "mean_token_accuracy": 0.4002988040447235, + "num_tokens": 5888453716.0, + "step": 11520 + }, + { + "epoch": 3.115467820443483, + "grad_norm": 2.921875, + "learning_rate": 0.007914824191810157, + "loss": 3.0646, + "mean_token_accuracy": 0.4195305109024048, + "num_tokens": 5888973929.0, + "step": 11521 + }, + { + "epoch": 3.115738236884803, + "grad_norm": 2.875, + "learning_rate": 0.00791334327038623, + "loss": 3.0962, + "mean_token_accuracy": 0.41184690594673157, + "num_tokens": 5889498100.0, + "step": 11522 + }, + { + "epoch": 3.1160086533261224, + "grad_norm": 3.046875, + "learning_rate": 0.007911862443670006, + "loss": 2.9477, + "mean_token_accuracy": 0.4428301453590393, + "num_tokens": 5890022355.0, + "step": 11523 + }, + { + "epoch": 3.116279069767442, + "grad_norm": 2.65625, + "learning_rate": 0.007910381711706929, + "loss": 2.9675, + "mean_token_accuracy": 0.4150925576686859, + "num_tokens": 5890546617.0, + "step": 11524 + }, + { + "epoch": 3.1165494862087613, + "grad_norm": 3.203125, + "learning_rate": 0.00790890107454242, + "loss": 3.0507, + "mean_token_accuracy": 0.41248488426208496, + "num_tokens": 5891070894.0, + "step": 11525 + }, + { + "epoch": 3.1168199026500814, + "grad_norm": 2.8125, + "learning_rate": 0.007907420532221923, + "loss": 2.922, + "mean_token_accuracy": 0.4031555950641632, + "num_tokens": 5891558045.0, + "step": 11526 + }, + { + "epoch": 3.1170903190914006, + "grad_norm": 2.15625, + "learning_rate": 0.007905940084790859, + "loss": 3.077, + "mean_token_accuracy": 0.40601637959480286, + "num_tokens": 5892082269.0, + "step": 11527 + }, + { + "epoch": 3.11736073553272, + "grad_norm": 3.53125, + "learning_rate": 0.007904459732294646, + "loss": 3.0206, + "mean_token_accuracy": 0.41624680161476135, + "num_tokens": 5892573381.0, + "step": 11528 + }, + { + "epoch": 3.11763115197404, + "grad_norm": 3.515625, + "learning_rate": 0.007902979474778717, + "loss": 3.0876, + "mean_token_accuracy": 0.41356536746025085, + "num_tokens": 5893097602.0, + "step": 11529 + }, + { + "epoch": 3.1179015684153595, + "grad_norm": 4.3125, + "learning_rate": 0.007901499312288484, + "loss": 3.0224, + "mean_token_accuracy": 0.39922258257865906, + "num_tokens": 5893509627.0, + "step": 11530 + }, + { + "epoch": 3.118171984856679, + "grad_norm": 25.875, + "learning_rate": 0.007900019244869364, + "loss": 10.4011, + "mean_token_accuracy": 0.010890249162912369, + "num_tokens": 5894033891.0, + "step": 11531 + }, + { + "epoch": 3.1184424012979988, + "grad_norm": 5.8125, + "learning_rate": 0.007898539272566769, + "loss": 3.1809, + "mean_token_accuracy": 0.40780559182167053, + "num_tokens": 5894522825.0, + "step": 11532 + }, + { + "epoch": 3.1187128177393184, + "grad_norm": 2.640625, + "learning_rate": 0.007897059395426113, + "loss": 2.8889, + "mean_token_accuracy": 0.40745896100997925, + "num_tokens": 5895047078.0, + "step": 11533 + }, + { + "epoch": 3.118983234180638, + "grad_norm": 3.078125, + "learning_rate": 0.007895579613492796, + "loss": 2.8938, + "mean_token_accuracy": 0.41993874311447144, + "num_tokens": 5895571243.0, + "step": 11534 + }, + { + "epoch": 3.1192536506219577, + "grad_norm": 3.4375, + "learning_rate": 0.007894099926812231, + "loss": 2.9042, + "mean_token_accuracy": 0.4019608199596405, + "num_tokens": 5896092217.0, + "step": 11535 + }, + { + "epoch": 3.1195240670632773, + "grad_norm": 2.9375, + "learning_rate": 0.007892620335429812, + "loss": 3.1956, + "mean_token_accuracy": 0.39510709047317505, + "num_tokens": 5896616501.0, + "step": 11536 + }, + { + "epoch": 3.119794483504597, + "grad_norm": 3.703125, + "learning_rate": 0.007891140839390935, + "loss": 2.9198, + "mean_token_accuracy": 0.45304805040359497, + "num_tokens": 5897130044.0, + "step": 11537 + }, + { + "epoch": 3.1200648999459166, + "grad_norm": 2.75, + "learning_rate": 0.007889661438741005, + "loss": 3.0291, + "mean_token_accuracy": 0.4165034890174866, + "num_tokens": 5897654268.0, + "step": 11538 + }, + { + "epoch": 3.1203353163872363, + "grad_norm": 2.609375, + "learning_rate": 0.007888182133525408, + "loss": 2.735, + "mean_token_accuracy": 0.4334699213504791, + "num_tokens": 5898178485.0, + "step": 11539 + }, + { + "epoch": 3.120605732828556, + "grad_norm": 2.46875, + "learning_rate": 0.00788670292378953, + "loss": 2.871, + "mean_token_accuracy": 0.40641435980796814, + "num_tokens": 5898702510.0, + "step": 11540 + }, + { + "epoch": 3.1208761492698756, + "grad_norm": 2.90625, + "learning_rate": 0.007885223809578767, + "loss": 2.9186, + "mean_token_accuracy": 0.421718567609787, + "num_tokens": 5899226715.0, + "step": 11541 + }, + { + "epoch": 3.121146565711195, + "grad_norm": 4.1875, + "learning_rate": 0.007883744790938499, + "loss": 3.0475, + "mean_token_accuracy": 0.4187898337841034, + "num_tokens": 5899750794.0, + "step": 11542 + }, + { + "epoch": 3.121416982152515, + "grad_norm": 3.25, + "learning_rate": 0.007882265867914098, + "loss": 2.9763, + "mean_token_accuracy": 0.41880688071250916, + "num_tokens": 5900275068.0, + "step": 11543 + }, + { + "epoch": 3.1216873985938345, + "grad_norm": 3.578125, + "learning_rate": 0.007880787040550953, + "loss": 2.8634, + "mean_token_accuracy": 0.4006307125091553, + "num_tokens": 5900799323.0, + "step": 11544 + }, + { + "epoch": 3.121957815035154, + "grad_norm": 3.078125, + "learning_rate": 0.007879308308894438, + "loss": 2.8419, + "mean_token_accuracy": 0.4276103973388672, + "num_tokens": 5901297682.0, + "step": 11545 + }, + { + "epoch": 3.1222282314764738, + "grad_norm": 3.234375, + "learning_rate": 0.007877829672989913, + "loss": 3.0379, + "mean_token_accuracy": 0.40851324796676636, + "num_tokens": 5901821889.0, + "step": 11546 + }, + { + "epoch": 3.1224986479177934, + "grad_norm": 3.046875, + "learning_rate": 0.007876351132882762, + "loss": 2.8788, + "mean_token_accuracy": 0.42048704624176025, + "num_tokens": 5902346057.0, + "step": 11547 + }, + { + "epoch": 3.122769064359113, + "grad_norm": 3.0625, + "learning_rate": 0.00787487268861834, + "loss": 3.1715, + "mean_token_accuracy": 0.4007621705532074, + "num_tokens": 5902870149.0, + "step": 11548 + }, + { + "epoch": 3.1230394808004327, + "grad_norm": 3.5, + "learning_rate": 0.00787339434024202, + "loss": 3.0175, + "mean_token_accuracy": 0.40502071380615234, + "num_tokens": 5903394377.0, + "step": 11549 + }, + { + "epoch": 3.1233098972417523, + "grad_norm": 2.609375, + "learning_rate": 0.007871916087799151, + "loss": 2.7719, + "mean_token_accuracy": 0.4055200219154358, + "num_tokens": 5903881477.0, + "step": 11550 + }, + { + "epoch": 3.123580313683072, + "grad_norm": 5.3125, + "learning_rate": 0.007870437931335099, + "loss": 9.9599, + "mean_token_accuracy": 3.290685981482966e-06, + "num_tokens": 5904405743.0, + "step": 11551 + }, + { + "epoch": 3.1238507301243916, + "grad_norm": 5.875, + "learning_rate": 0.007868959870895212, + "loss": 2.9749, + "mean_token_accuracy": 0.40760812163352966, + "num_tokens": 5904929971.0, + "step": 11552 + }, + { + "epoch": 3.1241211465657113, + "grad_norm": 2.328125, + "learning_rate": 0.007867481906524847, + "loss": 2.8698, + "mean_token_accuracy": 0.4163067936897278, + "num_tokens": 5905454257.0, + "step": 11553 + }, + { + "epoch": 3.124391563007031, + "grad_norm": 3.375, + "learning_rate": 0.007866004038269345, + "loss": 3.0915, + "mean_token_accuracy": 0.3912959694862366, + "num_tokens": 5905978455.0, + "step": 11554 + }, + { + "epoch": 3.1246619794483506, + "grad_norm": 3.6875, + "learning_rate": 0.007864526266174062, + "loss": 2.7497, + "mean_token_accuracy": 0.4185928404331207, + "num_tokens": 5906502671.0, + "step": 11555 + }, + { + "epoch": 3.12493239588967, + "grad_norm": 2.453125, + "learning_rate": 0.00786304859028433, + "loss": 2.9767, + "mean_token_accuracy": 0.43132954835891724, + "num_tokens": 5906931463.0, + "step": 11556 + }, + { + "epoch": 3.12520281233099, + "grad_norm": 2.984375, + "learning_rate": 0.007861571010645493, + "loss": 2.8182, + "mean_token_accuracy": 0.4322190284729004, + "num_tokens": 5907455738.0, + "step": 11557 + }, + { + "epoch": 3.1254732287723095, + "grad_norm": 2.984375, + "learning_rate": 0.007860093527302882, + "loss": 2.8714, + "mean_token_accuracy": 0.42896267771720886, + "num_tokens": 5907979969.0, + "step": 11558 + }, + { + "epoch": 3.125743645213629, + "grad_norm": 2.890625, + "learning_rate": 0.007858616140301844, + "loss": 2.9017, + "mean_token_accuracy": 0.40600481629371643, + "num_tokens": 5908504143.0, + "step": 11559 + }, + { + "epoch": 3.1260140616549488, + "grad_norm": 2.984375, + "learning_rate": 0.007857138849687693, + "loss": 2.8771, + "mean_token_accuracy": 0.4355664849281311, + "num_tokens": 5909028227.0, + "step": 11560 + }, + { + "epoch": 3.1262844780962684, + "grad_norm": 2.796875, + "learning_rate": 0.007855661655505771, + "loss": 2.9124, + "mean_token_accuracy": 0.4145891070365906, + "num_tokens": 5909552495.0, + "step": 11561 + }, + { + "epoch": 3.126554894537588, + "grad_norm": 2.796875, + "learning_rate": 0.007854184557801396, + "loss": 2.9334, + "mean_token_accuracy": 0.4122479259967804, + "num_tokens": 5910036700.0, + "step": 11562 + }, + { + "epoch": 3.1268253109789077, + "grad_norm": 2.796875, + "learning_rate": 0.007852707556619884, + "loss": 2.9709, + "mean_token_accuracy": 0.41431349515914917, + "num_tokens": 5910560968.0, + "step": 11563 + }, + { + "epoch": 3.1270957274202273, + "grad_norm": 3.0625, + "learning_rate": 0.007851230652006568, + "loss": 3.0383, + "mean_token_accuracy": 0.4210553765296936, + "num_tokens": 5911085200.0, + "step": 11564 + }, + { + "epoch": 3.127366143861547, + "grad_norm": 3.109375, + "learning_rate": 0.00784975384400675, + "loss": 2.898, + "mean_token_accuracy": 0.4096870422363281, + "num_tokens": 5911598983.0, + "step": 11565 + }, + { + "epoch": 3.127636560302866, + "grad_norm": 3.140625, + "learning_rate": 0.007848277132665747, + "loss": 2.8074, + "mean_token_accuracy": 0.4194115698337555, + "num_tokens": 5912123261.0, + "step": 11566 + }, + { + "epoch": 3.1279069767441863, + "grad_norm": 2.46875, + "learning_rate": 0.007846800518028874, + "loss": 3.1554, + "mean_token_accuracy": 0.40495139360427856, + "num_tokens": 5912647513.0, + "step": 11567 + }, + { + "epoch": 3.1281773931855055, + "grad_norm": 4.09375, + "learning_rate": 0.007845324000141434, + "loss": 2.9293, + "mean_token_accuracy": 0.4218977689743042, + "num_tokens": 5913171650.0, + "step": 11568 + }, + { + "epoch": 3.128447809626825, + "grad_norm": 2.15625, + "learning_rate": 0.007843847579048728, + "loss": 2.8554, + "mean_token_accuracy": 0.4342803657054901, + "num_tokens": 5913639738.0, + "step": 11569 + }, + { + "epoch": 3.1287182260681448, + "grad_norm": 2.75, + "learning_rate": 0.007842371254796063, + "loss": 2.9308, + "mean_token_accuracy": 0.4155426323413849, + "num_tokens": 5914163858.0, + "step": 11570 + }, + { + "epoch": 3.1289886425094644, + "grad_norm": 135.0, + "learning_rate": 0.00784089502742873, + "loss": 16.7059, + "mean_token_accuracy": 0.029460597783327103, + "num_tokens": 5914688068.0, + "step": 11571 + }, + { + "epoch": 3.129259058950784, + "grad_norm": 6.125, + "learning_rate": 0.007839418896992032, + "loss": 3.3779, + "mean_token_accuracy": 0.3464046120643616, + "num_tokens": 5915212253.0, + "step": 11572 + }, + { + "epoch": 3.1295294753921037, + "grad_norm": 2.0, + "learning_rate": 0.007837942863531256, + "loss": 3.1035, + "mean_token_accuracy": 0.407829225063324, + "num_tokens": 5915736428.0, + "step": 11573 + }, + { + "epoch": 3.1297998918334233, + "grad_norm": 2.828125, + "learning_rate": 0.007836466927091688, + "loss": 3.0309, + "mean_token_accuracy": 0.3921542763710022, + "num_tokens": 5916260715.0, + "step": 11574 + }, + { + "epoch": 3.130070308274743, + "grad_norm": 2.890625, + "learning_rate": 0.007834991087718622, + "loss": 3.1451, + "mean_token_accuracy": 0.40116357803344727, + "num_tokens": 5916784978.0, + "step": 11575 + }, + { + "epoch": 3.1303407247160626, + "grad_norm": 3.171875, + "learning_rate": 0.007833515345457339, + "loss": 3.0449, + "mean_token_accuracy": 0.40730687975883484, + "num_tokens": 5917309119.0, + "step": 11576 + }, + { + "epoch": 3.1306111411573823, + "grad_norm": 2.96875, + "learning_rate": 0.007832039700353111, + "loss": 3.157, + "mean_token_accuracy": 0.4219600260257721, + "num_tokens": 5917784412.0, + "step": 11577 + }, + { + "epoch": 3.130881557598702, + "grad_norm": 2.765625, + "learning_rate": 0.007830564152451227, + "loss": 2.8556, + "mean_token_accuracy": 0.40838056802749634, + "num_tokens": 5918308581.0, + "step": 11578 + }, + { + "epoch": 3.1311519740400215, + "grad_norm": 5.375, + "learning_rate": 0.007829088701796954, + "loss": 2.6162, + "mean_token_accuracy": 0.49491238594055176, + "num_tokens": 5918832710.0, + "step": 11579 + }, + { + "epoch": 3.131422390481341, + "grad_norm": 2.109375, + "learning_rate": 0.007827613348435562, + "loss": 3.0401, + "mean_token_accuracy": 0.4089927673339844, + "num_tokens": 5919356953.0, + "step": 11580 + }, + { + "epoch": 3.131692806922661, + "grad_norm": 3.609375, + "learning_rate": 0.007826138092412324, + "loss": 3.0577, + "mean_token_accuracy": 0.426004022359848, + "num_tokens": 5919832825.0, + "step": 11581 + }, + { + "epoch": 3.1319632233639805, + "grad_norm": 2.671875, + "learning_rate": 0.007824662933772507, + "loss": 2.859, + "mean_token_accuracy": 0.4266982078552246, + "num_tokens": 5920356913.0, + "step": 11582 + }, + { + "epoch": 3.1322336398053, + "grad_norm": 2.796875, + "learning_rate": 0.007823187872561362, + "loss": 3.0332, + "mean_token_accuracy": 0.4211461842060089, + "num_tokens": 5920881076.0, + "step": 11583 + }, + { + "epoch": 3.1325040562466198, + "grad_norm": 3.21875, + "learning_rate": 0.00782171290882416, + "loss": 2.737, + "mean_token_accuracy": 0.46770498156547546, + "num_tokens": 5921405300.0, + "step": 11584 + }, + { + "epoch": 3.1327744726879394, + "grad_norm": 2.921875, + "learning_rate": 0.007820238042606151, + "loss": 2.7866, + "mean_token_accuracy": 0.4260774254798889, + "num_tokens": 5921929487.0, + "step": 11585 + }, + { + "epoch": 3.133044889129259, + "grad_norm": 2.796875, + "learning_rate": 0.00781876327395259, + "loss": 3.0226, + "mean_token_accuracy": 0.4128448963165283, + "num_tokens": 5922431851.0, + "step": 11586 + }, + { + "epoch": 3.1333153055705787, + "grad_norm": 2.9375, + "learning_rate": 0.007817288602908727, + "loss": 2.9104, + "mean_token_accuracy": 0.4242617189884186, + "num_tokens": 5922902291.0, + "step": 11587 + }, + { + "epoch": 3.1335857220118983, + "grad_norm": 3.25, + "learning_rate": 0.007815814029519813, + "loss": 2.9289, + "mean_token_accuracy": 0.42211586236953735, + "num_tokens": 5923426449.0, + "step": 11588 + }, + { + "epoch": 3.133856138453218, + "grad_norm": 2.8125, + "learning_rate": 0.007814339553831082, + "loss": 2.9452, + "mean_token_accuracy": 0.4179654121398926, + "num_tokens": 5923950679.0, + "step": 11589 + }, + { + "epoch": 3.1341265548945376, + "grad_norm": 3.4375, + "learning_rate": 0.007812865175887787, + "loss": 2.956, + "mean_token_accuracy": 0.42584484815597534, + "num_tokens": 5924448426.0, + "step": 11590 + }, + { + "epoch": 3.1343969713358573, + "grad_norm": 40.25, + "learning_rate": 0.007811390895735162, + "loss": 12.0596, + "mean_token_accuracy": 0.012991427443921566, + "num_tokens": 5924964846.0, + "step": 11591 + }, + { + "epoch": 3.134667387777177, + "grad_norm": 6.625, + "learning_rate": 0.007809916713418436, + "loss": 3.161, + "mean_token_accuracy": 0.37183430790901184, + "num_tokens": 5925489032.0, + "step": 11592 + }, + { + "epoch": 3.1349378042184965, + "grad_norm": 2.171875, + "learning_rate": 0.007808442628982851, + "loss": 2.9537, + "mean_token_accuracy": 0.4148816466331482, + "num_tokens": 5926003621.0, + "step": 11593 + }, + { + "epoch": 3.135208220659816, + "grad_norm": 2.40625, + "learning_rate": 0.007806968642473628, + "loss": 3.0546, + "mean_token_accuracy": 0.40046435594558716, + "num_tokens": 5926483020.0, + "step": 11594 + }, + { + "epoch": 3.135478637101136, + "grad_norm": 2.75, + "learning_rate": 0.007805494753936, + "loss": 2.8848, + "mean_token_accuracy": 0.4197925329208374, + "num_tokens": 5926989537.0, + "step": 11595 + }, + { + "epoch": 3.1357490535424555, + "grad_norm": 3.265625, + "learning_rate": 0.0078040209634151906, + "loss": 3.0175, + "mean_token_accuracy": 0.40633800625801086, + "num_tokens": 5927513770.0, + "step": 11596 + }, + { + "epoch": 3.136019469983775, + "grad_norm": 3.125, + "learning_rate": 0.007802547270956411, + "loss": 2.806, + "mean_token_accuracy": 0.41203489899635315, + "num_tokens": 5928037900.0, + "step": 11597 + }, + { + "epoch": 3.1362898864250948, + "grad_norm": 3.65625, + "learning_rate": 0.007801073676604889, + "loss": 2.8475, + "mean_token_accuracy": 0.3921232223510742, + "num_tokens": 5928561968.0, + "step": 11598 + }, + { + "epoch": 3.1365603028664144, + "grad_norm": 2.359375, + "learning_rate": 0.007799600180405833, + "loss": 3.2511, + "mean_token_accuracy": 0.41045576333999634, + "num_tokens": 5929042531.0, + "step": 11599 + }, + { + "epoch": 3.136830719307734, + "grad_norm": 2.703125, + "learning_rate": 0.007798126782404449, + "loss": 2.9573, + "mean_token_accuracy": 0.40066638588905334, + "num_tokens": 5929566683.0, + "step": 11600 + }, + { + "epoch": 3.1371011357490537, + "grad_norm": 3.140625, + "learning_rate": 0.007796653482645957, + "loss": 2.9206, + "mean_token_accuracy": 0.45637014508247375, + "num_tokens": 5930090886.0, + "step": 11601 + }, + { + "epoch": 3.1373715521903733, + "grad_norm": 2.859375, + "learning_rate": 0.007795180281175558, + "loss": 2.941, + "mean_token_accuracy": 0.41988253593444824, + "num_tokens": 5930565173.0, + "step": 11602 + }, + { + "epoch": 3.137641968631693, + "grad_norm": 3.546875, + "learning_rate": 0.007793707178038447, + "loss": 2.8566, + "mean_token_accuracy": 0.4211505055427551, + "num_tokens": 5931089417.0, + "step": 11603 + }, + { + "epoch": 3.1379123850730126, + "grad_norm": 3.078125, + "learning_rate": 0.007792234173279835, + "loss": 3.0365, + "mean_token_accuracy": 0.4124678671360016, + "num_tokens": 5931556026.0, + "step": 11604 + }, + { + "epoch": 3.1381828015143323, + "grad_norm": 3.21875, + "learning_rate": 0.00779076126694491, + "loss": 2.9732, + "mean_token_accuracy": 0.4026917815208435, + "num_tokens": 5932080189.0, + "step": 11605 + }, + { + "epoch": 3.138453217955652, + "grad_norm": 3.40625, + "learning_rate": 0.0077892884590788644, + "loss": 3.1356, + "mean_token_accuracy": 0.404022216796875, + "num_tokens": 5932604330.0, + "step": 11606 + }, + { + "epoch": 3.138723634396971, + "grad_norm": 3.765625, + "learning_rate": 0.007787815749726894, + "loss": 3.0238, + "mean_token_accuracy": 0.41573619842529297, + "num_tokens": 5933128580.0, + "step": 11607 + }, + { + "epoch": 3.138994050838291, + "grad_norm": 3.375, + "learning_rate": 0.007786343138934182, + "loss": 2.8935, + "mean_token_accuracy": 0.41786277294158936, + "num_tokens": 5933652864.0, + "step": 11608 + }, + { + "epoch": 3.1392644672796104, + "grad_norm": 3.390625, + "learning_rate": 0.007784870626745912, + "loss": 3.0268, + "mean_token_accuracy": 0.4122176766395569, + "num_tokens": 5934176970.0, + "step": 11609 + }, + { + "epoch": 3.13953488372093, + "grad_norm": 3.125, + "learning_rate": 0.007783398213207269, + "loss": 3.0007, + "mean_token_accuracy": 0.4095906913280487, + "num_tokens": 5934701002.0, + "step": 11610 + }, + { + "epoch": 3.1398053001622497, + "grad_norm": 14.375, + "learning_rate": 0.0077819258983634265, + "loss": 12.9534, + "mean_token_accuracy": 0.00021462663426063955, + "num_tokens": 5935225270.0, + "step": 11611 + }, + { + "epoch": 3.1400757166035693, + "grad_norm": 7.65625, + "learning_rate": 0.007780453682259561, + "loss": 3.2024, + "mean_token_accuracy": 0.37873905897140503, + "num_tokens": 5935749519.0, + "step": 11612 + }, + { + "epoch": 3.140346133044889, + "grad_norm": 1.9921875, + "learning_rate": 0.007778981564940846, + "loss": 2.9154, + "mean_token_accuracy": 0.4142633080482483, + "num_tokens": 5936255699.0, + "step": 11613 + }, + { + "epoch": 3.1406165494862086, + "grad_norm": 3.8125, + "learning_rate": 0.007777509546452446, + "loss": 3.0825, + "mean_token_accuracy": 0.4072338938713074, + "num_tokens": 5936779957.0, + "step": 11614 + }, + { + "epoch": 3.1408869659275283, + "grad_norm": 3.140625, + "learning_rate": 0.007776037626839532, + "loss": 2.7387, + "mean_token_accuracy": 0.42189115285873413, + "num_tokens": 5937304155.0, + "step": 11615 + }, + { + "epoch": 3.141157382368848, + "grad_norm": 3.0, + "learning_rate": 0.007774565806147265, + "loss": 2.9924, + "mean_token_accuracy": 0.41711264848709106, + "num_tokens": 5937828370.0, + "step": 11616 + }, + { + "epoch": 3.1414277988101675, + "grad_norm": 2.75, + "learning_rate": 0.007773094084420802, + "loss": 2.9113, + "mean_token_accuracy": 0.4293298125267029, + "num_tokens": 5938352619.0, + "step": 11617 + }, + { + "epoch": 3.141698215251487, + "grad_norm": 2.8125, + "learning_rate": 0.007771622461705303, + "loss": 2.7331, + "mean_token_accuracy": 0.4443334937095642, + "num_tokens": 5938816311.0, + "step": 11618 + }, + { + "epoch": 3.141968631692807, + "grad_norm": 2.53125, + "learning_rate": 0.007770150938045926, + "loss": 2.9463, + "mean_token_accuracy": 0.4163840115070343, + "num_tokens": 5939340432.0, + "step": 11619 + }, + { + "epoch": 3.1422390481341265, + "grad_norm": 3.640625, + "learning_rate": 0.007768679513487811, + "loss": 3.1531, + "mean_token_accuracy": 0.4070577025413513, + "num_tokens": 5939864683.0, + "step": 11620 + }, + { + "epoch": 3.142509464575446, + "grad_norm": 2.765625, + "learning_rate": 0.007767208188076111, + "loss": 2.7261, + "mean_token_accuracy": 0.4446759819984436, + "num_tokens": 5940388919.0, + "step": 11621 + }, + { + "epoch": 3.1427798810167658, + "grad_norm": 2.546875, + "learning_rate": 0.007765736961855974, + "loss": 2.8933, + "mean_token_accuracy": 0.3979998230934143, + "num_tokens": 5940913196.0, + "step": 11622 + }, + { + "epoch": 3.1430502974580854, + "grad_norm": 2.890625, + "learning_rate": 0.007764265834872535, + "loss": 2.981, + "mean_token_accuracy": 0.42215853929519653, + "num_tokens": 5941437450.0, + "step": 11623 + }, + { + "epoch": 3.143320713899405, + "grad_norm": 2.90625, + "learning_rate": 0.0077627948071709385, + "loss": 3.103, + "mean_token_accuracy": 0.41459688544273376, + "num_tokens": 5941955945.0, + "step": 11624 + }, + { + "epoch": 3.1435911303407247, + "grad_norm": 3.546875, + "learning_rate": 0.007761323878796318, + "loss": 3.017, + "mean_token_accuracy": 0.43297767639160156, + "num_tokens": 5942462636.0, + "step": 11625 + }, + { + "epoch": 3.1438615467820443, + "grad_norm": 3.34375, + "learning_rate": 0.007759853049793802, + "loss": 3.0412, + "mean_token_accuracy": 0.4179273247718811, + "num_tokens": 5942986913.0, + "step": 11626 + }, + { + "epoch": 3.144131963223364, + "grad_norm": 4.25, + "learning_rate": 0.007758382320208526, + "loss": 3.0244, + "mean_token_accuracy": 0.40671300888061523, + "num_tokens": 5943511196.0, + "step": 11627 + }, + { + "epoch": 3.1444023796646836, + "grad_norm": 2.796875, + "learning_rate": 0.007756911690085613, + "loss": 3.0195, + "mean_token_accuracy": 0.43110787868499756, + "num_tokens": 5943985074.0, + "step": 11628 + }, + { + "epoch": 3.1446727961060033, + "grad_norm": 3.515625, + "learning_rate": 0.0077554411594701845, + "loss": 3.0162, + "mean_token_accuracy": 0.404792845249176, + "num_tokens": 5944509252.0, + "step": 11629 + }, + { + "epoch": 3.144943212547323, + "grad_norm": 2.984375, + "learning_rate": 0.007753970728407365, + "loss": 2.8145, + "mean_token_accuracy": 0.4254955053329468, + "num_tokens": 5945033522.0, + "step": 11630 + }, + { + "epoch": 3.1452136289886425, + "grad_norm": 32.5, + "learning_rate": 0.0077525003969422706, + "loss": 10.1587, + "mean_token_accuracy": 0.01500628050416708, + "num_tokens": 5945557691.0, + "step": 11631 + }, + { + "epoch": 3.145484045429962, + "grad_norm": 9.25, + "learning_rate": 0.00775103016512001, + "loss": 3.2226, + "mean_token_accuracy": 0.4024221897125244, + "num_tokens": 5946029244.0, + "step": 11632 + }, + { + "epoch": 3.145754461871282, + "grad_norm": 2.5, + "learning_rate": 0.007749560032985703, + "loss": 2.9783, + "mean_token_accuracy": 0.41931232810020447, + "num_tokens": 5946500228.0, + "step": 11633 + }, + { + "epoch": 3.1460248783126015, + "grad_norm": 2.5, + "learning_rate": 0.007748090000584451, + "loss": 3.1265, + "mean_token_accuracy": 0.41433796286582947, + "num_tokens": 5947024166.0, + "step": 11634 + }, + { + "epoch": 3.146295294753921, + "grad_norm": 1.8984375, + "learning_rate": 0.0077466200679613606, + "loss": 2.9638, + "mean_token_accuracy": 0.4113888144493103, + "num_tokens": 5947548380.0, + "step": 11635 + }, + { + "epoch": 3.1465657111952408, + "grad_norm": 3.09375, + "learning_rate": 0.007745150235161534, + "loss": 3.1133, + "mean_token_accuracy": 0.41438886523246765, + "num_tokens": 5948072586.0, + "step": 11636 + }, + { + "epoch": 3.1468361276365604, + "grad_norm": 2.625, + "learning_rate": 0.007743680502230073, + "loss": 2.8802, + "mean_token_accuracy": 0.42049020528793335, + "num_tokens": 5948568439.0, + "step": 11637 + }, + { + "epoch": 3.14710654407788, + "grad_norm": 2.359375, + "learning_rate": 0.007742210869212065, + "loss": 2.9996, + "mean_token_accuracy": 0.40040263533592224, + "num_tokens": 5949092585.0, + "step": 11638 + }, + { + "epoch": 3.1473769605191997, + "grad_norm": 2.90625, + "learning_rate": 0.0077407413361526125, + "loss": 3.0373, + "mean_token_accuracy": 0.4243001937866211, + "num_tokens": 5949611140.0, + "step": 11639 + }, + { + "epoch": 3.1476473769605193, + "grad_norm": 2.765625, + "learning_rate": 0.0077392719030967986, + "loss": 2.9916, + "mean_token_accuracy": 0.42169326543807983, + "num_tokens": 5950122709.0, + "step": 11640 + }, + { + "epoch": 3.147917793401839, + "grad_norm": 3.328125, + "learning_rate": 0.0077378025700897135, + "loss": 2.915, + "mean_token_accuracy": 0.3768083453178406, + "num_tokens": 5950646933.0, + "step": 11641 + }, + { + "epoch": 3.1481882098431586, + "grad_norm": 2.515625, + "learning_rate": 0.00773633333717644, + "loss": 2.9209, + "mean_token_accuracy": 0.43139123916625977, + "num_tokens": 5951171167.0, + "step": 11642 + }, + { + "epoch": 3.1484586262844783, + "grad_norm": 4.21875, + "learning_rate": 0.007734864204402053, + "loss": 2.8177, + "mean_token_accuracy": 0.4327388107776642, + "num_tokens": 5951663142.0, + "step": 11643 + }, + { + "epoch": 3.148729042725798, + "grad_norm": 2.421875, + "learning_rate": 0.007733395171811637, + "loss": 2.7191, + "mean_token_accuracy": 0.425087034702301, + "num_tokens": 5952187248.0, + "step": 11644 + }, + { + "epoch": 3.1489994591671175, + "grad_norm": 3.09375, + "learning_rate": 0.007731926239450264, + "loss": 3.0949, + "mean_token_accuracy": 0.3989174962043762, + "num_tokens": 5952711522.0, + "step": 11645 + }, + { + "epoch": 3.149269875608437, + "grad_norm": 3.125, + "learning_rate": 0.007730457407363003, + "loss": 2.8524, + "mean_token_accuracy": 0.4361233711242676, + "num_tokens": 5953235796.0, + "step": 11646 + }, + { + "epoch": 3.149540292049757, + "grad_norm": 3.515625, + "learning_rate": 0.0077289886755949255, + "loss": 2.7946, + "mean_token_accuracy": 0.4228421449661255, + "num_tokens": 5953759998.0, + "step": 11647 + }, + { + "epoch": 3.149810708491076, + "grad_norm": 2.609375, + "learning_rate": 0.007727520044191097, + "loss": 2.827, + "mean_token_accuracy": 0.44034045934677124, + "num_tokens": 5954284093.0, + "step": 11648 + }, + { + "epoch": 3.150081124932396, + "grad_norm": 3.328125, + "learning_rate": 0.007726051513196572, + "loss": 2.9028, + "mean_token_accuracy": 0.3976791203022003, + "num_tokens": 5954808214.0, + "step": 11649 + }, + { + "epoch": 3.1503515413737153, + "grad_norm": 2.9375, + "learning_rate": 0.0077245830826564174, + "loss": 3.1374, + "mean_token_accuracy": 0.40021511912345886, + "num_tokens": 5955332469.0, + "step": 11650 + }, + { + "epoch": 3.150621957815035, + "grad_norm": 8.9375, + "learning_rate": 0.00772311475261569, + "loss": 9.0988, + "mean_token_accuracy": 0.0023897013161331415, + "num_tokens": 5955798291.0, + "step": 11651 + }, + { + "epoch": 3.1508923742563546, + "grad_norm": 7.875, + "learning_rate": 0.00772164652311943, + "loss": 3.2276, + "mean_token_accuracy": 0.3557501435279846, + "num_tokens": 5956322456.0, + "step": 11652 + }, + { + "epoch": 3.1511627906976742, + "grad_norm": 2.46875, + "learning_rate": 0.007720178394212703, + "loss": 2.9717, + "mean_token_accuracy": 0.40320873260498047, + "num_tokens": 5956846562.0, + "step": 11653 + }, + { + "epoch": 3.151433207138994, + "grad_norm": 3.0625, + "learning_rate": 0.007718710365940549, + "loss": 2.9847, + "mean_token_accuracy": 0.43504685163497925, + "num_tokens": 5957370525.0, + "step": 11654 + }, + { + "epoch": 3.1517036235803135, + "grad_norm": 3.25, + "learning_rate": 0.007717242438348005, + "loss": 3.0926, + "mean_token_accuracy": 0.419863224029541, + "num_tokens": 5957858332.0, + "step": 11655 + }, + { + "epoch": 3.151974040021633, + "grad_norm": 3.8125, + "learning_rate": 0.00771577461148012, + "loss": 2.9265, + "mean_token_accuracy": 0.4711063802242279, + "num_tokens": 5958313620.0, + "step": 11656 + }, + { + "epoch": 3.152244456462953, + "grad_norm": 3.03125, + "learning_rate": 0.007714306885381927, + "loss": 3.0511, + "mean_token_accuracy": 0.39823585748672485, + "num_tokens": 5958837848.0, + "step": 11657 + }, + { + "epoch": 3.1525148729042725, + "grad_norm": 2.9375, + "learning_rate": 0.007712839260098461, + "loss": 2.94, + "mean_token_accuracy": 0.4219234585762024, + "num_tokens": 5959310501.0, + "step": 11658 + }, + { + "epoch": 3.152785289345592, + "grad_norm": 2.9375, + "learning_rate": 0.0077113717356747545, + "loss": 2.8343, + "mean_token_accuracy": 0.44966405630111694, + "num_tokens": 5959770855.0, + "step": 11659 + }, + { + "epoch": 3.1530557057869117, + "grad_norm": 3.21875, + "learning_rate": 0.007709904312155835, + "loss": 2.9781, + "mean_token_accuracy": 0.40711838006973267, + "num_tokens": 5960274527.0, + "step": 11660 + }, + { + "epoch": 3.1533261222282314, + "grad_norm": 4.0625, + "learning_rate": 0.007708436989586725, + "loss": 2.9339, + "mean_token_accuracy": 0.40776526927948, + "num_tokens": 5960798679.0, + "step": 11661 + }, + { + "epoch": 3.153596538669551, + "grad_norm": 3.625, + "learning_rate": 0.007706969768012451, + "loss": 3.0423, + "mean_token_accuracy": 0.41982218623161316, + "num_tokens": 5961222500.0, + "step": 11662 + }, + { + "epoch": 3.1538669551108707, + "grad_norm": 2.828125, + "learning_rate": 0.007705502647478025, + "loss": 2.8049, + "mean_token_accuracy": 0.441329687833786, + "num_tokens": 5961702934.0, + "step": 11663 + }, + { + "epoch": 3.1541373715521903, + "grad_norm": 3.765625, + "learning_rate": 0.007704035628028469, + "loss": 2.9659, + "mean_token_accuracy": 0.42215073108673096, + "num_tokens": 5962227154.0, + "step": 11664 + }, + { + "epoch": 3.15440778799351, + "grad_norm": 2.703125, + "learning_rate": 0.007702568709708791, + "loss": 2.9211, + "mean_token_accuracy": 0.42773568630218506, + "num_tokens": 5962751296.0, + "step": 11665 + }, + { + "epoch": 3.1546782044348296, + "grad_norm": 3.296875, + "learning_rate": 0.007701101892564002, + "loss": 2.8643, + "mean_token_accuracy": 0.4265686273574829, + "num_tokens": 5963275448.0, + "step": 11666 + }, + { + "epoch": 3.1549486208761492, + "grad_norm": 2.828125, + "learning_rate": 0.0076996351766391085, + "loss": 3.1024, + "mean_token_accuracy": 0.4184211492538452, + "num_tokens": 5963791722.0, + "step": 11667 + }, + { + "epoch": 3.155219037317469, + "grad_norm": 3.3125, + "learning_rate": 0.007698168561979118, + "loss": 3.1032, + "mean_token_accuracy": 0.4155614972114563, + "num_tokens": 5964315936.0, + "step": 11668 + }, + { + "epoch": 3.1554894537587885, + "grad_norm": 3.015625, + "learning_rate": 0.007696702048629018, + "loss": 3.1318, + "mean_token_accuracy": 0.4103614091873169, + "num_tokens": 5964840174.0, + "step": 11669 + }, + { + "epoch": 3.155759870200108, + "grad_norm": 3.8125, + "learning_rate": 0.007695235636633822, + "loss": 3.0714, + "mean_token_accuracy": 0.4164986312389374, + "num_tokens": 5965364413.0, + "step": 11670 + }, + { + "epoch": 3.156030286641428, + "grad_norm": 8.125, + "learning_rate": 0.007693769326038509, + "loss": 14.3934, + "mean_token_accuracy": 0.008823378942906857, + "num_tokens": 5965888671.0, + "step": 11671 + }, + { + "epoch": 3.1563007030827475, + "grad_norm": 5.90625, + "learning_rate": 0.0076923031168880775, + "loss": 3.0996, + "mean_token_accuracy": 0.405739426612854, + "num_tokens": 5966412821.0, + "step": 11672 + }, + { + "epoch": 3.156571119524067, + "grad_norm": 2.828125, + "learning_rate": 0.0076908370092275136, + "loss": 3.1383, + "mean_token_accuracy": 0.3958320617675781, + "num_tokens": 5966937003.0, + "step": 11673 + }, + { + "epoch": 3.1568415359653867, + "grad_norm": 2.484375, + "learning_rate": 0.007689371003101806, + "loss": 3.1079, + "mean_token_accuracy": 0.40103572607040405, + "num_tokens": 5967461283.0, + "step": 11674 + }, + { + "epoch": 3.1571119524067064, + "grad_norm": 3.21875, + "learning_rate": 0.007687905098555927, + "loss": 3.0132, + "mean_token_accuracy": 0.42053523659706116, + "num_tokens": 5967964621.0, + "step": 11675 + }, + { + "epoch": 3.157382368848026, + "grad_norm": 2.484375, + "learning_rate": 0.007686439295634862, + "loss": 2.9549, + "mean_token_accuracy": 0.4161791503429413, + "num_tokens": 5968488861.0, + "step": 11676 + }, + { + "epoch": 3.1576527852893457, + "grad_norm": 3.328125, + "learning_rate": 0.007684973594383584, + "loss": 3.0114, + "mean_token_accuracy": 0.40082186460494995, + "num_tokens": 5969013020.0, + "step": 11677 + }, + { + "epoch": 3.1579232017306653, + "grad_norm": 3.21875, + "learning_rate": 0.007683507994847063, + "loss": 3.1772, + "mean_token_accuracy": 0.40705621242523193, + "num_tokens": 5969537249.0, + "step": 11678 + }, + { + "epoch": 3.158193618171985, + "grad_norm": 3.34375, + "learning_rate": 0.007682042497070272, + "loss": 2.8666, + "mean_token_accuracy": 0.4425302743911743, + "num_tokens": 5970022311.0, + "step": 11679 + }, + { + "epoch": 3.1584640346133046, + "grad_norm": 2.75, + "learning_rate": 0.007680577101098174, + "loss": 2.9283, + "mean_token_accuracy": 0.42228761315345764, + "num_tokens": 5970546446.0, + "step": 11680 + }, + { + "epoch": 3.1587344510546242, + "grad_norm": 3.25, + "learning_rate": 0.007679111806975729, + "loss": 3.016, + "mean_token_accuracy": 0.4076537489891052, + "num_tokens": 5971070715.0, + "step": 11681 + }, + { + "epoch": 3.159004867495944, + "grad_norm": 2.953125, + "learning_rate": 0.007677646614747904, + "loss": 3.0197, + "mean_token_accuracy": 0.4210643470287323, + "num_tokens": 5971594995.0, + "step": 11682 + }, + { + "epoch": 3.1592752839372635, + "grad_norm": 3.21875, + "learning_rate": 0.0076761815244596535, + "loss": 3.0056, + "mean_token_accuracy": 0.4198797643184662, + "num_tokens": 5972119251.0, + "step": 11683 + }, + { + "epoch": 3.159545700378583, + "grad_norm": 2.703125, + "learning_rate": 0.007674716536155923, + "loss": 3.0367, + "mean_token_accuracy": 0.40825366973876953, + "num_tokens": 5972643502.0, + "step": 11684 + }, + { + "epoch": 3.159816116819903, + "grad_norm": 3.0625, + "learning_rate": 0.007673251649881669, + "loss": 2.9868, + "mean_token_accuracy": 0.4381057620048523, + "num_tokens": 5973167669.0, + "step": 11685 + }, + { + "epoch": 3.1600865332612225, + "grad_norm": 2.765625, + "learning_rate": 0.00767178686568184, + "loss": 3.0105, + "mean_token_accuracy": 0.40315738320350647, + "num_tokens": 5973691913.0, + "step": 11686 + }, + { + "epoch": 3.160356949702542, + "grad_norm": 3.28125, + "learning_rate": 0.007670322183601377, + "loss": 3.0494, + "mean_token_accuracy": 0.4294660687446594, + "num_tokens": 5974216115.0, + "step": 11687 + }, + { + "epoch": 3.1606273661438617, + "grad_norm": 3.109375, + "learning_rate": 0.0076688576036852245, + "loss": 2.8897, + "mean_token_accuracy": 0.41118496656417847, + "num_tokens": 5974706371.0, + "step": 11688 + }, + { + "epoch": 3.160897782585181, + "grad_norm": 3.640625, + "learning_rate": 0.00766739312597831, + "loss": 2.7529, + "mean_token_accuracy": 0.40031009912490845, + "num_tokens": 5975230604.0, + "step": 11689 + }, + { + "epoch": 3.161168199026501, + "grad_norm": 2.703125, + "learning_rate": 0.007665928750525579, + "loss": 2.9494, + "mean_token_accuracy": 0.40250301361083984, + "num_tokens": 5975754831.0, + "step": 11690 + }, + { + "epoch": 3.1614386154678202, + "grad_norm": 4.625, + "learning_rate": 0.00766446447737196, + "loss": 10.8072, + "mean_token_accuracy": 0.009225880727171898, + "num_tokens": 5976279075.0, + "step": 11691 + }, + { + "epoch": 3.16170903190914, + "grad_norm": 6.71875, + "learning_rate": 0.007663000306562378, + "loss": 2.9385, + "mean_token_accuracy": 0.3945237994194031, + "num_tokens": 5976803332.0, + "step": 11692 + }, + { + "epoch": 3.1619794483504595, + "grad_norm": 2.78125, + "learning_rate": 0.00766153623814176, + "loss": 3.1885, + "mean_token_accuracy": 0.38205045461654663, + "num_tokens": 5977327431.0, + "step": 11693 + }, + { + "epoch": 3.162249864791779, + "grad_norm": 2.671875, + "learning_rate": 0.00766007227215503, + "loss": 3.1524, + "mean_token_accuracy": 0.42008349299430847, + "num_tokens": 5977814358.0, + "step": 11694 + }, + { + "epoch": 3.162520281233099, + "grad_norm": 2.78125, + "learning_rate": 0.0076586084086471004, + "loss": 2.9896, + "mean_token_accuracy": 0.4252038896083832, + "num_tokens": 5978338558.0, + "step": 11695 + }, + { + "epoch": 3.1627906976744184, + "grad_norm": 2.875, + "learning_rate": 0.007657144647662895, + "loss": 2.9331, + "mean_token_accuracy": 0.4190107583999634, + "num_tokens": 5978862678.0, + "step": 11696 + }, + { + "epoch": 3.163061114115738, + "grad_norm": 2.734375, + "learning_rate": 0.0076556809892473245, + "loss": 2.9892, + "mean_token_accuracy": 0.4268344044685364, + "num_tokens": 5979386927.0, + "step": 11697 + }, + { + "epoch": 3.1633315305570577, + "grad_norm": 2.6875, + "learning_rate": 0.007654217433445292, + "loss": 3.0216, + "mean_token_accuracy": 0.4119413495063782, + "num_tokens": 5979911122.0, + "step": 11698 + }, + { + "epoch": 3.1636019469983774, + "grad_norm": 3.328125, + "learning_rate": 0.00765275398030171, + "loss": 2.9871, + "mean_token_accuracy": 0.4192858636379242, + "num_tokens": 5980435282.0, + "step": 11699 + }, + { + "epoch": 3.163872363439697, + "grad_norm": 3.34375, + "learning_rate": 0.007651290629861478, + "loss": 3.0894, + "mean_token_accuracy": 0.4008106291294098, + "num_tokens": 5980959563.0, + "step": 11700 + }, + { + "epoch": 3.1641427798810167, + "grad_norm": 2.984375, + "learning_rate": 0.007649827382169496, + "loss": 3.061, + "mean_token_accuracy": 0.3925931751728058, + "num_tokens": 5981483569.0, + "step": 11701 + }, + { + "epoch": 3.1644131963223363, + "grad_norm": 2.875, + "learning_rate": 0.007648364237270666, + "loss": 2.8944, + "mean_token_accuracy": 0.415794312953949, + "num_tokens": 5982007799.0, + "step": 11702 + }, + { + "epoch": 3.164683612763656, + "grad_norm": 3.578125, + "learning_rate": 0.007646901195209876, + "loss": 2.9496, + "mean_token_accuracy": 0.41461989283561707, + "num_tokens": 5982471146.0, + "step": 11703 + }, + { + "epoch": 3.1649540292049756, + "grad_norm": 2.984375, + "learning_rate": 0.007645438256032018, + "loss": 3.0187, + "mean_token_accuracy": 0.405448317527771, + "num_tokens": 5982995411.0, + "step": 11704 + }, + { + "epoch": 3.1652244456462952, + "grad_norm": 3.28125, + "learning_rate": 0.007643975419781981, + "loss": 2.9688, + "mean_token_accuracy": 0.4151821732521057, + "num_tokens": 5983519629.0, + "step": 11705 + }, + { + "epoch": 3.165494862087615, + "grad_norm": 2.890625, + "learning_rate": 0.007642512686504645, + "loss": 2.9121, + "mean_token_accuracy": 0.4436008632183075, + "num_tokens": 5984043771.0, + "step": 11706 + }, + { + "epoch": 3.1657652785289345, + "grad_norm": 2.4375, + "learning_rate": 0.007641050056244896, + "loss": 2.9237, + "mean_token_accuracy": 0.410244882106781, + "num_tokens": 5984568044.0, + "step": 11707 + }, + { + "epoch": 3.166035694970254, + "grad_norm": 2.609375, + "learning_rate": 0.007639587529047607, + "loss": 2.765, + "mean_token_accuracy": 0.4439569115638733, + "num_tokens": 5985039633.0, + "step": 11708 + }, + { + "epoch": 3.166306111411574, + "grad_norm": 3.25, + "learning_rate": 0.007638125104957654, + "loss": 2.9891, + "mean_token_accuracy": 0.4381018280982971, + "num_tokens": 5985505644.0, + "step": 11709 + }, + { + "epoch": 3.1665765278528935, + "grad_norm": 3.34375, + "learning_rate": 0.007636662784019912, + "loss": 2.9655, + "mean_token_accuracy": 0.41856062412261963, + "num_tokens": 5986029807.0, + "step": 11710 + }, + { + "epoch": 3.166846944294213, + "grad_norm": 147.0, + "learning_rate": 0.007635200566279248, + "loss": 20.7668, + "mean_token_accuracy": 0.0, + "num_tokens": 5986553957.0, + "step": 11711 + }, + { + "epoch": 3.1671173607355327, + "grad_norm": 8.1875, + "learning_rate": 0.007633738451780521, + "loss": 3.0055, + "mean_token_accuracy": 0.4307247996330261, + "num_tokens": 5987078235.0, + "step": 11712 + }, + { + "epoch": 3.1673877771768524, + "grad_norm": 2.21875, + "learning_rate": 0.007632276440568602, + "loss": 2.9953, + "mean_token_accuracy": 0.401561975479126, + "num_tokens": 5987602413.0, + "step": 11713 + }, + { + "epoch": 3.167658193618172, + "grad_norm": 2.640625, + "learning_rate": 0.007630814532688346, + "loss": 3.0357, + "mean_token_accuracy": 0.4222941994667053, + "num_tokens": 5988126569.0, + "step": 11714 + }, + { + "epoch": 3.1679286100594917, + "grad_norm": 3.984375, + "learning_rate": 0.0076293527281846045, + "loss": 3.0153, + "mean_token_accuracy": 0.3978385329246521, + "num_tokens": 5988650803.0, + "step": 11715 + }, + { + "epoch": 3.1681990265008113, + "grad_norm": 3.234375, + "learning_rate": 0.007627891027102236, + "loss": 3.0585, + "mean_token_accuracy": 0.40125149488449097, + "num_tokens": 5989174991.0, + "step": 11716 + }, + { + "epoch": 3.168469442942131, + "grad_norm": 3.453125, + "learning_rate": 0.0076264294294860905, + "loss": 2.8623, + "mean_token_accuracy": 0.41456907987594604, + "num_tokens": 5989699253.0, + "step": 11717 + }, + { + "epoch": 3.1687398593834506, + "grad_norm": 3.203125, + "learning_rate": 0.007624967935381004, + "loss": 3.1064, + "mean_token_accuracy": 0.4001633822917938, + "num_tokens": 5990223414.0, + "step": 11718 + }, + { + "epoch": 3.1690102758247702, + "grad_norm": 3.46875, + "learning_rate": 0.007623506544831833, + "loss": 3.2448, + "mean_token_accuracy": 0.38995206356048584, + "num_tokens": 5990747674.0, + "step": 11719 + }, + { + "epoch": 3.16928069226609, + "grad_norm": 2.984375, + "learning_rate": 0.007622045257883407, + "loss": 3.0504, + "mean_token_accuracy": 0.39788269996643066, + "num_tokens": 5991271720.0, + "step": 11720 + }, + { + "epoch": 3.1695511087074095, + "grad_norm": 2.71875, + "learning_rate": 0.007620584074580568, + "loss": 3.0077, + "mean_token_accuracy": 0.4224494993686676, + "num_tokens": 5991781944.0, + "step": 11721 + }, + { + "epoch": 3.169821525148729, + "grad_norm": 3.0, + "learning_rate": 0.0076191229949681485, + "loss": 3.042, + "mean_token_accuracy": 0.41310805082321167, + "num_tokens": 5992306170.0, + "step": 11722 + }, + { + "epoch": 3.170091941590049, + "grad_norm": 3.203125, + "learning_rate": 0.007617662019090979, + "loss": 3.092, + "mean_token_accuracy": 0.415000855922699, + "num_tokens": 5992779925.0, + "step": 11723 + }, + { + "epoch": 3.1703623580313685, + "grad_norm": 3.109375, + "learning_rate": 0.007616201146993879, + "loss": 2.9321, + "mean_token_accuracy": 0.4348609149456024, + "num_tokens": 5993304179.0, + "step": 11724 + }, + { + "epoch": 3.170632774472688, + "grad_norm": 2.796875, + "learning_rate": 0.007614740378721688, + "loss": 3.1604, + "mean_token_accuracy": 0.39853402972221375, + "num_tokens": 5993828389.0, + "step": 11725 + }, + { + "epoch": 3.1709031909140077, + "grad_norm": 2.8125, + "learning_rate": 0.007613279714319215, + "loss": 2.9751, + "mean_token_accuracy": 0.420248806476593, + "num_tokens": 5994352628.0, + "step": 11726 + }, + { + "epoch": 3.1711736073553274, + "grad_norm": 5.59375, + "learning_rate": 0.007611819153831274, + "loss": 2.8817, + "mean_token_accuracy": 0.41692763566970825, + "num_tokens": 5994876897.0, + "step": 11727 + }, + { + "epoch": 3.171444023796647, + "grad_norm": 2.046875, + "learning_rate": 0.007610358697302691, + "loss": 3.0335, + "mean_token_accuracy": 0.4118424654006958, + "num_tokens": 5995387552.0, + "step": 11728 + }, + { + "epoch": 3.1717144402379667, + "grad_norm": 3.6875, + "learning_rate": 0.00760889834477827, + "loss": 2.9565, + "mean_token_accuracy": 0.42307955026626587, + "num_tokens": 5995911739.0, + "step": 11729 + }, + { + "epoch": 3.171984856679286, + "grad_norm": 2.375, + "learning_rate": 0.007607438096302819, + "loss": 3.0136, + "mean_token_accuracy": 0.4071219861507416, + "num_tokens": 5996436002.0, + "step": 11730 + }, + { + "epoch": 3.172255273120606, + "grad_norm": 12.5, + "learning_rate": 0.007605977951921148, + "loss": 11.5584, + "mean_token_accuracy": 0.0005963763687759638, + "num_tokens": 5996960085.0, + "step": 11731 + }, + { + "epoch": 3.172525689561925, + "grad_norm": 7.4375, + "learning_rate": 0.007604517911678048, + "loss": 3.3907, + "mean_token_accuracy": 0.3660983741283417, + "num_tokens": 5997484356.0, + "step": 11732 + }, + { + "epoch": 3.172796106003245, + "grad_norm": 2.8125, + "learning_rate": 0.007603057975618331, + "loss": 3.1246, + "mean_token_accuracy": 0.3936734199523926, + "num_tokens": 5998008585.0, + "step": 11733 + }, + { + "epoch": 3.1730665224445644, + "grad_norm": 3.171875, + "learning_rate": 0.007601598143786783, + "loss": 3.1149, + "mean_token_accuracy": 0.4134020209312439, + "num_tokens": 5998532688.0, + "step": 11734 + }, + { + "epoch": 3.173336938885884, + "grad_norm": 3.0, + "learning_rate": 0.007600138416228195, + "loss": 2.9268, + "mean_token_accuracy": 0.4328896105289459, + "num_tokens": 5999056959.0, + "step": 11735 + }, + { + "epoch": 3.1736073553272037, + "grad_norm": 3.046875, + "learning_rate": 0.007598678792987359, + "loss": 2.789, + "mean_token_accuracy": 0.442208468914032, + "num_tokens": 5999551985.0, + "step": 11736 + }, + { + "epoch": 3.1738777717685234, + "grad_norm": 2.703125, + "learning_rate": 0.007597219274109064, + "loss": 2.8436, + "mean_token_accuracy": 0.42127513885498047, + "num_tokens": 6000076239.0, + "step": 11737 + }, + { + "epoch": 3.174148188209843, + "grad_norm": 2.640625, + "learning_rate": 0.007595759859638085, + "loss": 2.8878, + "mean_token_accuracy": 0.4237869381904602, + "num_tokens": 6000600251.0, + "step": 11738 + }, + { + "epoch": 3.1744186046511627, + "grad_norm": 2.671875, + "learning_rate": 0.007594300549619209, + "loss": 3.0769, + "mean_token_accuracy": 0.40755555033683777, + "num_tokens": 6001122256.0, + "step": 11739 + }, + { + "epoch": 3.1746890210924823, + "grad_norm": 2.828125, + "learning_rate": 0.007592841344097206, + "loss": 3.2276, + "mean_token_accuracy": 0.40431585907936096, + "num_tokens": 6001646499.0, + "step": 11740 + }, + { + "epoch": 3.174959437533802, + "grad_norm": 2.640625, + "learning_rate": 0.007591382243116847, + "loss": 2.9697, + "mean_token_accuracy": 0.4042096734046936, + "num_tokens": 6002170778.0, + "step": 11741 + }, + { + "epoch": 3.1752298539751216, + "grad_norm": 3.09375, + "learning_rate": 0.00758992324672291, + "loss": 2.9738, + "mean_token_accuracy": 0.42836683988571167, + "num_tokens": 6002695004.0, + "step": 11742 + }, + { + "epoch": 3.1755002704164412, + "grad_norm": 3.625, + "learning_rate": 0.007588464354960152, + "loss": 3.1143, + "mean_token_accuracy": 0.41764745116233826, + "num_tokens": 6003219182.0, + "step": 11743 + }, + { + "epoch": 3.175770686857761, + "grad_norm": 3.09375, + "learning_rate": 0.0075870055678733395, + "loss": 3.0061, + "mean_token_accuracy": 0.40482214093208313, + "num_tokens": 6003743439.0, + "step": 11744 + }, + { + "epoch": 3.1760411032990805, + "grad_norm": 2.78125, + "learning_rate": 0.007585546885507237, + "loss": 2.9846, + "mean_token_accuracy": 0.42130157351493835, + "num_tokens": 6004267614.0, + "step": 11745 + }, + { + "epoch": 3.1763115197404, + "grad_norm": 2.875, + "learning_rate": 0.007584088307906596, + "loss": 2.9277, + "mean_token_accuracy": 0.4275071620941162, + "num_tokens": 6004758927.0, + "step": 11746 + }, + { + "epoch": 3.17658193618172, + "grad_norm": 3.203125, + "learning_rate": 0.00758262983511617, + "loss": 2.9343, + "mean_token_accuracy": 0.4397856891155243, + "num_tokens": 6005269356.0, + "step": 11747 + }, + { + "epoch": 3.1768523526230394, + "grad_norm": 3.59375, + "learning_rate": 0.0075811714671807145, + "loss": 3.0359, + "mean_token_accuracy": 0.41698914766311646, + "num_tokens": 6005793632.0, + "step": 11748 + }, + { + "epoch": 3.177122769064359, + "grad_norm": 3.0625, + "learning_rate": 0.007579713204144967, + "loss": 2.9464, + "mean_token_accuracy": 0.3994767665863037, + "num_tokens": 6006317737.0, + "step": 11749 + }, + { + "epoch": 3.1773931855056787, + "grad_norm": 2.796875, + "learning_rate": 0.007578255046053679, + "loss": 2.9328, + "mean_token_accuracy": 0.40755486488342285, + "num_tokens": 6006825150.0, + "step": 11750 + }, + { + "epoch": 3.1776636019469984, + "grad_norm": 6.21875, + "learning_rate": 0.007576796992951591, + "loss": 10.0922, + "mean_token_accuracy": 0.010721570812165737, + "num_tokens": 6007349345.0, + "step": 11751 + }, + { + "epoch": 3.177934018388318, + "grad_norm": 5.65625, + "learning_rate": 0.0075753390448834385, + "loss": 3.1719, + "mean_token_accuracy": 0.42170214653015137, + "num_tokens": 6007814153.0, + "step": 11752 + }, + { + "epoch": 3.1782044348296377, + "grad_norm": 2.390625, + "learning_rate": 0.0075738812018939505, + "loss": 2.9547, + "mean_token_accuracy": 0.4096983075141907, + "num_tokens": 6008338428.0, + "step": 11753 + }, + { + "epoch": 3.1784748512709573, + "grad_norm": 3.734375, + "learning_rate": 0.0075724234640278665, + "loss": 2.9073, + "mean_token_accuracy": 0.43602877855300903, + "num_tokens": 6008855639.0, + "step": 11754 + }, + { + "epoch": 3.178745267712277, + "grad_norm": 3.109375, + "learning_rate": 0.0075709658313299075, + "loss": 2.7694, + "mean_token_accuracy": 0.4114847183227539, + "num_tokens": 6009379766.0, + "step": 11755 + }, + { + "epoch": 3.1790156841535966, + "grad_norm": 2.734375, + "learning_rate": 0.0075695083038448034, + "loss": 3.1495, + "mean_token_accuracy": 0.39023953676223755, + "num_tokens": 6009866648.0, + "step": 11756 + }, + { + "epoch": 3.1792861005949162, + "grad_norm": 2.78125, + "learning_rate": 0.007568050881617272, + "loss": 3.0291, + "mean_token_accuracy": 0.42009779810905457, + "num_tokens": 6010390812.0, + "step": 11757 + }, + { + "epoch": 3.179556517036236, + "grad_norm": 2.828125, + "learning_rate": 0.00756659356469203, + "loss": 2.9006, + "mean_token_accuracy": 0.43967872858047485, + "num_tokens": 6010915036.0, + "step": 11758 + }, + { + "epoch": 3.1798269334775555, + "grad_norm": 3.46875, + "learning_rate": 0.0075651363531138, + "loss": 2.9198, + "mean_token_accuracy": 0.42205002903938293, + "num_tokens": 6011439182.0, + "step": 11759 + }, + { + "epoch": 3.180097349918875, + "grad_norm": 3.734375, + "learning_rate": 0.007563679246927286, + "loss": 2.9846, + "mean_token_accuracy": 0.4225670099258423, + "num_tokens": 6011963344.0, + "step": 11760 + }, + { + "epoch": 3.180367766360195, + "grad_norm": 3.859375, + "learning_rate": 0.007562222246177195, + "loss": 3.0746, + "mean_token_accuracy": 0.4303411543369293, + "num_tokens": 6012487569.0, + "step": 11761 + }, + { + "epoch": 3.1806381828015144, + "grad_norm": 3.203125, + "learning_rate": 0.0075607653509082405, + "loss": 3.0608, + "mean_token_accuracy": 0.41517168283462524, + "num_tokens": 6013011783.0, + "step": 11762 + }, + { + "epoch": 3.180908599242834, + "grad_norm": 2.953125, + "learning_rate": 0.007559308561165115, + "loss": 2.9211, + "mean_token_accuracy": 0.43822744488716125, + "num_tokens": 6013535870.0, + "step": 11763 + }, + { + "epoch": 3.1811790156841537, + "grad_norm": 2.890625, + "learning_rate": 0.007557851876992523, + "loss": 2.9472, + "mean_token_accuracy": 0.4129963517189026, + "num_tokens": 6014060015.0, + "step": 11764 + }, + { + "epoch": 3.1814494321254734, + "grad_norm": 2.625, + "learning_rate": 0.007556395298435157, + "loss": 2.9566, + "mean_token_accuracy": 0.4210703670978546, + "num_tokens": 6014584270.0, + "step": 11765 + }, + { + "epoch": 3.181719848566793, + "grad_norm": 2.8125, + "learning_rate": 0.007554938825537712, + "loss": 2.9187, + "mean_token_accuracy": 0.41864386200904846, + "num_tokens": 6015108395.0, + "step": 11766 + }, + { + "epoch": 3.1819902650081127, + "grad_norm": 2.703125, + "learning_rate": 0.007553482458344871, + "loss": 2.9996, + "mean_token_accuracy": 0.42237210273742676, + "num_tokens": 6015632593.0, + "step": 11767 + }, + { + "epoch": 3.1822606814494323, + "grad_norm": 3.0625, + "learning_rate": 0.007552026196901329, + "loss": 3.2071, + "mean_token_accuracy": 0.40625715255737305, + "num_tokens": 6016156847.0, + "step": 11768 + }, + { + "epoch": 3.182531097890752, + "grad_norm": 2.625, + "learning_rate": 0.007550570041251763, + "loss": 2.8645, + "mean_token_accuracy": 0.4080314040184021, + "num_tokens": 6016681127.0, + "step": 11769 + }, + { + "epoch": 3.1828015143320716, + "grad_norm": 2.8125, + "learning_rate": 0.007549113991440848, + "loss": 3.028, + "mean_token_accuracy": 0.39991506934165955, + "num_tokens": 6017205310.0, + "step": 11770 + }, + { + "epoch": 3.183071930773391, + "grad_norm": 5.5, + "learning_rate": 0.007547658047513266, + "loss": 10.6689, + "mean_token_accuracy": 9.957719885278493e-05, + "num_tokens": 6017729526.0, + "step": 11771 + }, + { + "epoch": 3.183342347214711, + "grad_norm": 7.3125, + "learning_rate": 0.00754620220951369, + "loss": 3.2971, + "mean_token_accuracy": 0.38269394636154175, + "num_tokens": 6018253758.0, + "step": 11772 + }, + { + "epoch": 3.18361276365603, + "grad_norm": 2.140625, + "learning_rate": 0.007544746477486783, + "loss": 3.0705, + "mean_token_accuracy": 0.40322428941726685, + "num_tokens": 6018777977.0, + "step": 11773 + }, + { + "epoch": 3.1838831800973497, + "grad_norm": 3.109375, + "learning_rate": 0.007543290851477221, + "loss": 3.0694, + "mean_token_accuracy": 0.3951641917228699, + "num_tokens": 6019302236.0, + "step": 11774 + }, + { + "epoch": 3.1841535965386694, + "grad_norm": 3.265625, + "learning_rate": 0.007541835331529656, + "loss": 2.9075, + "mean_token_accuracy": 0.4384761154651642, + "num_tokens": 6019816944.0, + "step": 11775 + }, + { + "epoch": 3.184424012979989, + "grad_norm": 3.265625, + "learning_rate": 0.007540379917688753, + "loss": 2.7539, + "mean_token_accuracy": 0.4166823625564575, + "num_tokens": 6020336605.0, + "step": 11776 + }, + { + "epoch": 3.1846944294213086, + "grad_norm": 2.734375, + "learning_rate": 0.007538924609999172, + "loss": 2.8748, + "mean_token_accuracy": 0.42422908544540405, + "num_tokens": 6020860814.0, + "step": 11777 + }, + { + "epoch": 3.1849648458626283, + "grad_norm": 3.4375, + "learning_rate": 0.007537469408505557, + "loss": 3.0037, + "mean_token_accuracy": 0.4158581495285034, + "num_tokens": 6021385063.0, + "step": 11778 + }, + { + "epoch": 3.185235262303948, + "grad_norm": 2.921875, + "learning_rate": 0.007536014313252566, + "loss": 2.8807, + "mean_token_accuracy": 0.43641555309295654, + "num_tokens": 6021851811.0, + "step": 11779 + }, + { + "epoch": 3.1855056787452676, + "grad_norm": 3.75, + "learning_rate": 0.007534559324284846, + "loss": 2.9559, + "mean_token_accuracy": 0.42307835817337036, + "num_tokens": 6022370154.0, + "step": 11780 + }, + { + "epoch": 3.185776095186587, + "grad_norm": 2.984375, + "learning_rate": 0.007533104441647032, + "loss": 2.8895, + "mean_token_accuracy": 0.42570775747299194, + "num_tokens": 6022894230.0, + "step": 11781 + }, + { + "epoch": 3.186046511627907, + "grad_norm": 3.546875, + "learning_rate": 0.007531649665383773, + "loss": 3.0785, + "mean_token_accuracy": 0.4111674726009369, + "num_tokens": 6023418389.0, + "step": 11782 + }, + { + "epoch": 3.1863169280692265, + "grad_norm": 2.984375, + "learning_rate": 0.007530194995539701, + "loss": 2.9459, + "mean_token_accuracy": 0.42325636744499207, + "num_tokens": 6023942652.0, + "step": 11783 + }, + { + "epoch": 3.186587344510546, + "grad_norm": 3.375, + "learning_rate": 0.007528740432159447, + "loss": 3.0525, + "mean_token_accuracy": 0.4016788601875305, + "num_tokens": 6024441473.0, + "step": 11784 + }, + { + "epoch": 3.186857760951866, + "grad_norm": 2.90625, + "learning_rate": 0.00752728597528765, + "loss": 3.1461, + "mean_token_accuracy": 0.40171536803245544, + "num_tokens": 6024965702.0, + "step": 11785 + }, + { + "epoch": 3.1871281773931854, + "grad_norm": 3.15625, + "learning_rate": 0.007525831624968931, + "loss": 2.7504, + "mean_token_accuracy": 0.4338918924331665, + "num_tokens": 6025489906.0, + "step": 11786 + }, + { + "epoch": 3.187398593834505, + "grad_norm": 2.796875, + "learning_rate": 0.007524377381247909, + "loss": 3.0698, + "mean_token_accuracy": 0.4115925431251526, + "num_tokens": 6026014110.0, + "step": 11787 + }, + { + "epoch": 3.1876690102758247, + "grad_norm": 4.53125, + "learning_rate": 0.007522923244169218, + "loss": 3.0811, + "mean_token_accuracy": 0.4096295237541199, + "num_tokens": 6026538290.0, + "step": 11788 + }, + { + "epoch": 3.1879394267171444, + "grad_norm": 2.46875, + "learning_rate": 0.007521469213777466, + "loss": 2.9695, + "mean_token_accuracy": 0.41316449642181396, + "num_tokens": 6027033013.0, + "step": 11789 + }, + { + "epoch": 3.188209843158464, + "grad_norm": 3.203125, + "learning_rate": 0.0075200152901172615, + "loss": 2.8405, + "mean_token_accuracy": 0.43119192123413086, + "num_tokens": 6027557211.0, + "step": 11790 + }, + { + "epoch": 3.1884802595997837, + "grad_norm": 12.875, + "learning_rate": 0.007518561473233228, + "loss": 10.1791, + "mean_token_accuracy": 0.009165661409497261, + "num_tokens": 6028081344.0, + "step": 11791 + }, + { + "epoch": 3.1887506760411033, + "grad_norm": 6.40625, + "learning_rate": 0.007517107763169966, + "loss": 3.1326, + "mean_token_accuracy": 0.40415143966674805, + "num_tokens": 6028605557.0, + "step": 11792 + }, + { + "epoch": 3.189021092482423, + "grad_norm": 2.859375, + "learning_rate": 0.0075156541599720765, + "loss": 3.0671, + "mean_token_accuracy": 0.39853912591934204, + "num_tokens": 6029129640.0, + "step": 11793 + }, + { + "epoch": 3.1892915089237426, + "grad_norm": 2.8125, + "learning_rate": 0.007514200663684168, + "loss": 2.964, + "mean_token_accuracy": 0.42160993814468384, + "num_tokens": 6029605675.0, + "step": 11794 + }, + { + "epoch": 3.1895619253650622, + "grad_norm": 2.96875, + "learning_rate": 0.007512747274350835, + "loss": 2.8691, + "mean_token_accuracy": 0.4202253818511963, + "num_tokens": 6030113603.0, + "step": 11795 + }, + { + "epoch": 3.189832341806382, + "grad_norm": 3.03125, + "learning_rate": 0.007511293992016668, + "loss": 3.135, + "mean_token_accuracy": 0.41589775681495667, + "num_tokens": 6030581316.0, + "step": 11796 + }, + { + "epoch": 3.1901027582477015, + "grad_norm": 3.4375, + "learning_rate": 0.007509840816726263, + "loss": 3.0582, + "mean_token_accuracy": 0.3860349953174591, + "num_tokens": 6031105535.0, + "step": 11797 + }, + { + "epoch": 3.190373174689021, + "grad_norm": 2.734375, + "learning_rate": 0.007508387748524204, + "loss": 2.9795, + "mean_token_accuracy": 0.44501346349716187, + "num_tokens": 6031567940.0, + "step": 11798 + }, + { + "epoch": 3.190643591130341, + "grad_norm": 2.78125, + "learning_rate": 0.0075069347874550775, + "loss": 3.0202, + "mean_token_accuracy": 0.4277327060699463, + "num_tokens": 6032092219.0, + "step": 11799 + }, + { + "epoch": 3.1909140075716604, + "grad_norm": 3.234375, + "learning_rate": 0.007505481933563465, + "loss": 3.1074, + "mean_token_accuracy": 0.41142189502716064, + "num_tokens": 6032616499.0, + "step": 11800 + }, + { + "epoch": 3.19118442401298, + "grad_norm": 3.375, + "learning_rate": 0.007504029186893941, + "loss": 3.0366, + "mean_token_accuracy": 0.4339820146560669, + "num_tokens": 6033067887.0, + "step": 11801 + }, + { + "epoch": 3.1914548404542997, + "grad_norm": 3.578125, + "learning_rate": 0.007502576547491085, + "loss": 3.1162, + "mean_token_accuracy": 0.4182606339454651, + "num_tokens": 6033592111.0, + "step": 11802 + }, + { + "epoch": 3.1917252568956194, + "grad_norm": 3.203125, + "learning_rate": 0.007501124015399468, + "loss": 2.9794, + "mean_token_accuracy": 0.3913077116012573, + "num_tokens": 6034116347.0, + "step": 11803 + }, + { + "epoch": 3.191995673336939, + "grad_norm": 3.03125, + "learning_rate": 0.00749967159066365, + "loss": 3.0224, + "mean_token_accuracy": 0.4128091335296631, + "num_tokens": 6034640548.0, + "step": 11804 + }, + { + "epoch": 3.1922660897782587, + "grad_norm": 3.25, + "learning_rate": 0.007498219273328206, + "loss": 2.7594, + "mean_token_accuracy": 0.439917653799057, + "num_tokens": 6035116380.0, + "step": 11805 + }, + { + "epoch": 3.1925365062195783, + "grad_norm": 2.5625, + "learning_rate": 0.0074967670634376905, + "loss": 2.8667, + "mean_token_accuracy": 0.4321523904800415, + "num_tokens": 6035640454.0, + "step": 11806 + }, + { + "epoch": 3.192806922660898, + "grad_norm": 3.078125, + "learning_rate": 0.0074953149610366655, + "loss": 2.9916, + "mean_token_accuracy": 0.3908236622810364, + "num_tokens": 6036164547.0, + "step": 11807 + }, + { + "epoch": 3.1930773391022176, + "grad_norm": 3.1875, + "learning_rate": 0.007493862966169682, + "loss": 3.1362, + "mean_token_accuracy": 0.4246661067008972, + "num_tokens": 6036688773.0, + "step": 11808 + }, + { + "epoch": 3.1933477555435372, + "grad_norm": 3.171875, + "learning_rate": 0.007492411078881296, + "loss": 2.9108, + "mean_token_accuracy": 0.47443822026252747, + "num_tokens": 6037148333.0, + "step": 11809 + }, + { + "epoch": 3.193618171984857, + "grad_norm": 2.78125, + "learning_rate": 0.007490959299216049, + "loss": 2.9241, + "mean_token_accuracy": 0.42224979400634766, + "num_tokens": 6037642380.0, + "step": 11810 + }, + { + "epoch": 3.1938885884261765, + "grad_norm": 5.8125, + "learning_rate": 0.0074895076272184944, + "loss": 10.2938, + "mean_token_accuracy": 0.0004907639813609421, + "num_tokens": 6038166656.0, + "step": 11811 + }, + { + "epoch": 3.1941590048674957, + "grad_norm": 7.21875, + "learning_rate": 0.007488056062933171, + "loss": 3.0892, + "mean_token_accuracy": 0.40731585025787354, + "num_tokens": 6038690790.0, + "step": 11812 + }, + { + "epoch": 3.194429421308816, + "grad_norm": 2.15625, + "learning_rate": 0.00748660460640461, + "loss": 2.939, + "mean_token_accuracy": 0.4263850152492523, + "num_tokens": 6039151380.0, + "step": 11813 + }, + { + "epoch": 3.194699837750135, + "grad_norm": 3.109375, + "learning_rate": 0.007485153257677353, + "loss": 2.9507, + "mean_token_accuracy": 0.42894482612609863, + "num_tokens": 6039632859.0, + "step": 11814 + }, + { + "epoch": 3.1949702541914546, + "grad_norm": 3.390625, + "learning_rate": 0.0074837020167959345, + "loss": 2.9653, + "mean_token_accuracy": 0.4204936623573303, + "num_tokens": 6040156997.0, + "step": 11815 + }, + { + "epoch": 3.1952406706327743, + "grad_norm": 3.078125, + "learning_rate": 0.007482250883804873, + "loss": 2.8623, + "mean_token_accuracy": 0.4283277690410614, + "num_tokens": 6040681276.0, + "step": 11816 + }, + { + "epoch": 3.195511087074094, + "grad_norm": 3.46875, + "learning_rate": 0.007480799858748706, + "loss": 2.9898, + "mean_token_accuracy": 0.42633965611457825, + "num_tokens": 6041149813.0, + "step": 11817 + }, + { + "epoch": 3.1957815035154136, + "grad_norm": 2.5, + "learning_rate": 0.007479348941671946, + "loss": 2.8055, + "mean_token_accuracy": 0.42322874069213867, + "num_tokens": 6041674060.0, + "step": 11818 + }, + { + "epoch": 3.196051919956733, + "grad_norm": 3.234375, + "learning_rate": 0.007477898132619112, + "loss": 2.9181, + "mean_token_accuracy": 0.4162299931049347, + "num_tokens": 6042198339.0, + "step": 11819 + }, + { + "epoch": 3.196322336398053, + "grad_norm": 2.859375, + "learning_rate": 0.007476447431634723, + "loss": 3.0825, + "mean_token_accuracy": 0.4061615765094757, + "num_tokens": 6042722581.0, + "step": 11820 + }, + { + "epoch": 3.1965927528393725, + "grad_norm": 2.875, + "learning_rate": 0.007474996838763291, + "loss": 2.7589, + "mean_token_accuracy": 0.43150222301483154, + "num_tokens": 6043246669.0, + "step": 11821 + }, + { + "epoch": 3.196863169280692, + "grad_norm": 2.984375, + "learning_rate": 0.007473546354049316, + "loss": 2.9022, + "mean_token_accuracy": 0.42155295610427856, + "num_tokens": 6043770735.0, + "step": 11822 + }, + { + "epoch": 3.197133585722012, + "grad_norm": 3.609375, + "learning_rate": 0.007472095977537314, + "loss": 2.9924, + "mean_token_accuracy": 0.41271543502807617, + "num_tokens": 6044294910.0, + "step": 11823 + }, + { + "epoch": 3.1974040021633314, + "grad_norm": 3.4375, + "learning_rate": 0.007470645709271782, + "loss": 2.9559, + "mean_token_accuracy": 0.4276784062385559, + "num_tokens": 6044819173.0, + "step": 11824 + }, + { + "epoch": 3.197674418604651, + "grad_norm": 3.453125, + "learning_rate": 0.0074691955492972185, + "loss": 2.8811, + "mean_token_accuracy": 0.4501432180404663, + "num_tokens": 6045287219.0, + "step": 11825 + }, + { + "epoch": 3.1979448350459707, + "grad_norm": 2.6875, + "learning_rate": 0.007467745497658121, + "loss": 3.0785, + "mean_token_accuracy": 0.4121561348438263, + "num_tokens": 6045803690.0, + "step": 11826 + }, + { + "epoch": 3.1982152514872904, + "grad_norm": 2.625, + "learning_rate": 0.007466295554398974, + "loss": 2.7732, + "mean_token_accuracy": 0.42583131790161133, + "num_tokens": 6046327924.0, + "step": 11827 + }, + { + "epoch": 3.19848566792861, + "grad_norm": 2.390625, + "learning_rate": 0.007464845719564275, + "loss": 2.8361, + "mean_token_accuracy": 0.44627514481544495, + "num_tokens": 6046802582.0, + "step": 11828 + }, + { + "epoch": 3.1987560843699296, + "grad_norm": 3.359375, + "learning_rate": 0.007463395993198507, + "loss": 2.8741, + "mean_token_accuracy": 0.4534778594970703, + "num_tokens": 6047264581.0, + "step": 11829 + }, + { + "epoch": 3.1990265008112493, + "grad_norm": 3.328125, + "learning_rate": 0.007461946375346147, + "loss": 3.0129, + "mean_token_accuracy": 0.41957980394363403, + "num_tokens": 6047788864.0, + "step": 11830 + }, + { + "epoch": 3.199296917252569, + "grad_norm": 9.0, + "learning_rate": 0.007460496866051678, + "loss": 9.0264, + "mean_token_accuracy": 0.02614554949104786, + "num_tokens": 6048265894.0, + "step": 11831 + }, + { + "epoch": 3.1995673336938886, + "grad_norm": 61.0, + "learning_rate": 0.007459047465359576, + "loss": 3.4736, + "mean_token_accuracy": 0.3661867380142212, + "num_tokens": 6048790154.0, + "step": 11832 + }, + { + "epoch": 3.199837750135208, + "grad_norm": 11.8125, + "learning_rate": 0.007457598173314308, + "loss": 3.5038, + "mean_token_accuracy": 0.3528871536254883, + "num_tokens": 6049314264.0, + "step": 11833 + }, + { + "epoch": 3.200108166576528, + "grad_norm": 3.328125, + "learning_rate": 0.00745614898996035, + "loss": 3.1603, + "mean_token_accuracy": 0.4132423400878906, + "num_tokens": 6049722546.0, + "step": 11834 + }, + { + "epoch": 3.2003785830178475, + "grad_norm": 2.234375, + "learning_rate": 0.00745469991534216, + "loss": 2.8958, + "mean_token_accuracy": 0.414224237203598, + "num_tokens": 6050237991.0, + "step": 11835 + }, + { + "epoch": 3.200648999459167, + "grad_norm": 2.75, + "learning_rate": 0.0074532509495042, + "loss": 3.0949, + "mean_token_accuracy": 0.41944223642349243, + "num_tokens": 6050747462.0, + "step": 11836 + }, + { + "epoch": 3.200919415900487, + "grad_norm": 3.4375, + "learning_rate": 0.0074518020924909355, + "loss": 3.0981, + "mean_token_accuracy": 0.42488059401512146, + "num_tokens": 6051236440.0, + "step": 11837 + }, + { + "epoch": 3.2011898323418064, + "grad_norm": 3.328125, + "learning_rate": 0.0074503533443468165, + "loss": 3.0586, + "mean_token_accuracy": 0.41243433952331543, + "num_tokens": 6051760561.0, + "step": 11838 + }, + { + "epoch": 3.201460248783126, + "grad_norm": 4.03125, + "learning_rate": 0.007448904705116293, + "loss": 3.1894, + "mean_token_accuracy": 0.44518643617630005, + "num_tokens": 6052220096.0, + "step": 11839 + }, + { + "epoch": 3.2017306652244457, + "grad_norm": 4.0, + "learning_rate": 0.007447456174843818, + "loss": 3.1676, + "mean_token_accuracy": 0.3954690396785736, + "num_tokens": 6052744340.0, + "step": 11840 + }, + { + "epoch": 3.2020010816657654, + "grad_norm": 5.75, + "learning_rate": 0.007446007753573835, + "loss": 2.7882, + "mean_token_accuracy": 0.45677506923675537, + "num_tokens": 6053268606.0, + "step": 11841 + }, + { + "epoch": 3.202271498107085, + "grad_norm": 1.8671875, + "learning_rate": 0.0074445594413507865, + "loss": 3.0581, + "mean_token_accuracy": 0.4304031431674957, + "num_tokens": 6053788472.0, + "step": 11842 + }, + { + "epoch": 3.2025419145484046, + "grad_norm": 3.65625, + "learning_rate": 0.007443111238219109, + "loss": 2.9661, + "mean_token_accuracy": 0.43093955516815186, + "num_tokens": 6054312487.0, + "step": 11843 + }, + { + "epoch": 3.2028123309897243, + "grad_norm": 3.21875, + "learning_rate": 0.007441663144223243, + "loss": 3.1192, + "mean_token_accuracy": 0.3983078598976135, + "num_tokens": 6054815242.0, + "step": 11844 + }, + { + "epoch": 3.203082747431044, + "grad_norm": 4.5625, + "learning_rate": 0.007440215159407609, + "loss": 3.0145, + "mean_token_accuracy": 0.4134845435619354, + "num_tokens": 6055339435.0, + "step": 11845 + }, + { + "epoch": 3.2033531638723636, + "grad_norm": 2.796875, + "learning_rate": 0.007438767283816648, + "loss": 2.8063, + "mean_token_accuracy": 0.43539416790008545, + "num_tokens": 6055823813.0, + "step": 11846 + }, + { + "epoch": 3.203623580313683, + "grad_norm": 2.828125, + "learning_rate": 0.007437319517494774, + "loss": 3.0495, + "mean_token_accuracy": 0.3984760642051697, + "num_tokens": 6056348080.0, + "step": 11847 + }, + { + "epoch": 3.203893996755003, + "grad_norm": 3.109375, + "learning_rate": 0.007435871860486421, + "loss": 2.9412, + "mean_token_accuracy": 0.43887531757354736, + "num_tokens": 6056872242.0, + "step": 11848 + }, + { + "epoch": 3.2041644131963225, + "grad_norm": 3.140625, + "learning_rate": 0.007434424312835997, + "loss": 3.0999, + "mean_token_accuracy": 0.39057809114456177, + "num_tokens": 6057396359.0, + "step": 11849 + }, + { + "epoch": 3.204434829637642, + "grad_norm": 3.03125, + "learning_rate": 0.00743297687458792, + "loss": 3.1428, + "mean_token_accuracy": 0.417293906211853, + "num_tokens": 6057920595.0, + "step": 11850 + }, + { + "epoch": 3.204705246078962, + "grad_norm": 15.25, + "learning_rate": 0.007431529545786606, + "loss": 13.7655, + "mean_token_accuracy": 0.010402634739875793, + "num_tokens": 6058404717.0, + "step": 11851 + }, + { + "epoch": 3.2049756625202814, + "grad_norm": 5.6875, + "learning_rate": 0.007430082326476457, + "loss": 3.0542, + "mean_token_accuracy": 0.40214312076568604, + "num_tokens": 6058928980.0, + "step": 11852 + }, + { + "epoch": 3.2052460789616006, + "grad_norm": 3.046875, + "learning_rate": 0.00742863521670188, + "loss": 2.9377, + "mean_token_accuracy": 0.40249091386795044, + "num_tokens": 6059453112.0, + "step": 11853 + }, + { + "epoch": 3.2055164954029207, + "grad_norm": 2.484375, + "learning_rate": 0.007427188216507279, + "loss": 3.0415, + "mean_token_accuracy": 0.4165630340576172, + "num_tokens": 6059977119.0, + "step": 11854 + }, + { + "epoch": 3.20578691184424, + "grad_norm": 11.0625, + "learning_rate": 0.007425741325937052, + "loss": 2.7584, + "mean_token_accuracy": 0.46487176418304443, + "num_tokens": 6060477051.0, + "step": 11855 + }, + { + "epoch": 3.2060573282855596, + "grad_norm": 2.578125, + "learning_rate": 0.007424294545035588, + "loss": 3.0671, + "mean_token_accuracy": 0.4152889847755432, + "num_tokens": 6060973825.0, + "step": 11856 + }, + { + "epoch": 3.206327744726879, + "grad_norm": 2.9375, + "learning_rate": 0.007422847873847285, + "loss": 2.7045, + "mean_token_accuracy": 0.4147684574127197, + "num_tokens": 6061419697.0, + "step": 11857 + }, + { + "epoch": 3.206598161168199, + "grad_norm": 2.296875, + "learning_rate": 0.007421401312416531, + "loss": 3.0354, + "mean_token_accuracy": 0.4594017267227173, + "num_tokens": 6061875761.0, + "step": 11858 + }, + { + "epoch": 3.2068685776095185, + "grad_norm": 2.921875, + "learning_rate": 0.007419954860787705, + "loss": 3.1296, + "mean_token_accuracy": 0.41109734773635864, + "num_tokens": 6062399894.0, + "step": 11859 + }, + { + "epoch": 3.207138994050838, + "grad_norm": 3.53125, + "learning_rate": 0.007418508519005196, + "loss": 3.1066, + "mean_token_accuracy": 0.40572264790534973, + "num_tokens": 6062924173.0, + "step": 11860 + }, + { + "epoch": 3.2074094104921578, + "grad_norm": 2.984375, + "learning_rate": 0.0074170622871133785, + "loss": 2.9993, + "mean_token_accuracy": 0.41791409254074097, + "num_tokens": 6063448410.0, + "step": 11861 + }, + { + "epoch": 3.2076798269334774, + "grad_norm": 3.390625, + "learning_rate": 0.00741561616515662, + "loss": 3.0834, + "mean_token_accuracy": 0.4027126133441925, + "num_tokens": 6063953665.0, + "step": 11862 + }, + { + "epoch": 3.207950243374797, + "grad_norm": 2.34375, + "learning_rate": 0.007414170153179303, + "loss": 2.8867, + "mean_token_accuracy": 0.45016756653785706, + "num_tokens": 6064388056.0, + "step": 11863 + }, + { + "epoch": 3.2082206598161167, + "grad_norm": 2.78125, + "learning_rate": 0.007412724251225793, + "loss": 2.9635, + "mean_token_accuracy": 0.4223136305809021, + "num_tokens": 6064882827.0, + "step": 11864 + }, + { + "epoch": 3.2084910762574363, + "grad_norm": 2.84375, + "learning_rate": 0.007411278459340447, + "loss": 3.0897, + "mean_token_accuracy": 0.41074931621551514, + "num_tokens": 6065390328.0, + "step": 11865 + }, + { + "epoch": 3.208761492698756, + "grad_norm": 3.265625, + "learning_rate": 0.007409832777567638, + "loss": 3.1103, + "mean_token_accuracy": 0.416670024394989, + "num_tokens": 6065877779.0, + "step": 11866 + }, + { + "epoch": 3.2090319091400756, + "grad_norm": 2.828125, + "learning_rate": 0.007408387205951716, + "loss": 2.9627, + "mean_token_accuracy": 0.42040660977363586, + "num_tokens": 6066378841.0, + "step": 11867 + }, + { + "epoch": 3.2093023255813953, + "grad_norm": 2.6875, + "learning_rate": 0.007406941744537032, + "loss": 2.989, + "mean_token_accuracy": 0.42583775520324707, + "num_tokens": 6066886012.0, + "step": 11868 + }, + { + "epoch": 3.209572742022715, + "grad_norm": 2.8125, + "learning_rate": 0.007405496393367945, + "loss": 2.8412, + "mean_token_accuracy": 0.41259193420410156, + "num_tokens": 6067410246.0, + "step": 11869 + }, + { + "epoch": 3.2098431584640346, + "grad_norm": 2.890625, + "learning_rate": 0.007404051152488798, + "loss": 3.1254, + "mean_token_accuracy": 0.41984450817108154, + "num_tokens": 6067897784.0, + "step": 11870 + }, + { + "epoch": 3.210113574905354, + "grad_norm": 17.125, + "learning_rate": 0.007402606021943939, + "loss": 9.5252, + "mean_token_accuracy": 0.011077485978603363, + "num_tokens": 6068421969.0, + "step": 11871 + }, + { + "epoch": 3.210383991346674, + "grad_norm": 6.375, + "learning_rate": 0.0074011610017777054, + "loss": 3.1704, + "mean_token_accuracy": 0.378722220659256, + "num_tokens": 6068870638.0, + "step": 11872 + }, + { + "epoch": 3.2106544077879935, + "grad_norm": 7.46875, + "learning_rate": 0.0073997160920344345, + "loss": 2.6671, + "mean_token_accuracy": 0.4440756142139435, + "num_tokens": 6069394788.0, + "step": 11873 + }, + { + "epoch": 3.210924824229313, + "grad_norm": 2.484375, + "learning_rate": 0.007398271292758465, + "loss": 2.8387, + "mean_token_accuracy": 0.4280305504798889, + "num_tokens": 6069918976.0, + "step": 11874 + }, + { + "epoch": 3.2111952406706328, + "grad_norm": 2.4375, + "learning_rate": 0.0073968266039941225, + "loss": 3.0097, + "mean_token_accuracy": 0.4176705777645111, + "num_tokens": 6070443249.0, + "step": 11875 + }, + { + "epoch": 3.2114656571119524, + "grad_norm": 3.21875, + "learning_rate": 0.007395382025785733, + "loss": 2.859, + "mean_token_accuracy": 0.41048699617385864, + "num_tokens": 6070967426.0, + "step": 11876 + }, + { + "epoch": 3.211736073553272, + "grad_norm": 2.578125, + "learning_rate": 0.007393937558177628, + "loss": 2.9824, + "mean_token_accuracy": 0.43031811714172363, + "num_tokens": 6071443047.0, + "step": 11877 + }, + { + "epoch": 3.2120064899945917, + "grad_norm": 3.109375, + "learning_rate": 0.007392493201214121, + "loss": 2.847, + "mean_token_accuracy": 0.41469961404800415, + "num_tokens": 6071967172.0, + "step": 11878 + }, + { + "epoch": 3.2122769064359114, + "grad_norm": 2.375, + "learning_rate": 0.00739104895493953, + "loss": 3.0273, + "mean_token_accuracy": 0.4138256907463074, + "num_tokens": 6072491397.0, + "step": 11879 + }, + { + "epoch": 3.212547322877231, + "grad_norm": 3.609375, + "learning_rate": 0.0073896048193981744, + "loss": 3.0086, + "mean_token_accuracy": 0.4141382575035095, + "num_tokens": 6073015664.0, + "step": 11880 + }, + { + "epoch": 3.2128177393185506, + "grad_norm": 3.046875, + "learning_rate": 0.00738816079463436, + "loss": 3.0056, + "mean_token_accuracy": 0.40824151039123535, + "num_tokens": 6073539863.0, + "step": 11881 + }, + { + "epoch": 3.2130881557598703, + "grad_norm": 3.546875, + "learning_rate": 0.00738671688069239, + "loss": 2.9751, + "mean_token_accuracy": 0.4162330627441406, + "num_tokens": 6074064052.0, + "step": 11882 + }, + { + "epoch": 3.21335857220119, + "grad_norm": 2.90625, + "learning_rate": 0.007385273077616577, + "loss": 2.9906, + "mean_token_accuracy": 0.41843268275260925, + "num_tokens": 6074570221.0, + "step": 11883 + }, + { + "epoch": 3.2136289886425096, + "grad_norm": 2.71875, + "learning_rate": 0.007383829385451213, + "loss": 3.0279, + "mean_token_accuracy": 0.4037869870662689, + "num_tokens": 6075054924.0, + "step": 11884 + }, + { + "epoch": 3.213899405083829, + "grad_norm": 2.5, + "learning_rate": 0.007382385804240599, + "loss": 2.9881, + "mean_token_accuracy": 0.4187447726726532, + "num_tokens": 6075579205.0, + "step": 11885 + }, + { + "epoch": 3.214169821525149, + "grad_norm": 2.75, + "learning_rate": 0.007380942334029026, + "loss": 2.9349, + "mean_token_accuracy": 0.4242819845676422, + "num_tokens": 6076103238.0, + "step": 11886 + }, + { + "epoch": 3.2144402379664685, + "grad_norm": 2.90625, + "learning_rate": 0.0073794989748607875, + "loss": 2.9122, + "mean_token_accuracy": 0.4177617132663727, + "num_tokens": 6076627495.0, + "step": 11887 + }, + { + "epoch": 3.214710654407788, + "grad_norm": 3.296875, + "learning_rate": 0.007378055726780164, + "loss": 2.6774, + "mean_token_accuracy": 0.420471727848053, + "num_tokens": 6077151743.0, + "step": 11888 + }, + { + "epoch": 3.214981070849108, + "grad_norm": 2.34375, + "learning_rate": 0.007376612589831446, + "loss": 3.0, + "mean_token_accuracy": 0.4318385124206543, + "num_tokens": 6077666901.0, + "step": 11889 + }, + { + "epoch": 3.2152514872904274, + "grad_norm": 3.546875, + "learning_rate": 0.007375169564058908, + "loss": 3.069, + "mean_token_accuracy": 0.4063871204853058, + "num_tokens": 6078191114.0, + "step": 11890 + }, + { + "epoch": 3.215521903731747, + "grad_norm": 6.46875, + "learning_rate": 0.0073737266495068265, + "loss": 10.2453, + "mean_token_accuracy": 1.0065703463624232e-05, + "num_tokens": 6078715223.0, + "step": 11891 + }, + { + "epoch": 3.2157923201730667, + "grad_norm": 5.71875, + "learning_rate": 0.007372283846219477, + "loss": 3.1046, + "mean_token_accuracy": 0.39571741223335266, + "num_tokens": 6079239451.0, + "step": 11892 + }, + { + "epoch": 3.2160627366143864, + "grad_norm": 2.375, + "learning_rate": 0.0073708411542411255, + "loss": 3.0223, + "mean_token_accuracy": 0.4089496433734894, + "num_tokens": 6079763739.0, + "step": 11893 + }, + { + "epoch": 3.2163331530557056, + "grad_norm": 3.3125, + "learning_rate": 0.007369398573616042, + "loss": 3.1208, + "mean_token_accuracy": 0.41750597953796387, + "num_tokens": 6080251831.0, + "step": 11894 + }, + { + "epoch": 3.2166035694970256, + "grad_norm": 3.59375, + "learning_rate": 0.007367956104388489, + "loss": 3.1856, + "mean_token_accuracy": 0.4128182530403137, + "num_tokens": 6080744879.0, + "step": 11895 + }, + { + "epoch": 3.216873985938345, + "grad_norm": 2.625, + "learning_rate": 0.007366513746602719, + "loss": 2.8875, + "mean_token_accuracy": 0.414908230304718, + "num_tokens": 6081269071.0, + "step": 11896 + }, + { + "epoch": 3.2171444023796645, + "grad_norm": 2.8125, + "learning_rate": 0.007365071500302998, + "loss": 2.9731, + "mean_token_accuracy": 0.4195716381072998, + "num_tokens": 6081698156.0, + "step": 11897 + }, + { + "epoch": 3.217414818820984, + "grad_norm": 2.8125, + "learning_rate": 0.007363629365533569, + "loss": 2.9512, + "mean_token_accuracy": 0.42659154534339905, + "num_tokens": 6082222391.0, + "step": 11898 + }, + { + "epoch": 3.2176852352623038, + "grad_norm": 3.515625, + "learning_rate": 0.007362187342338686, + "loss": 2.9666, + "mean_token_accuracy": 0.40645670890808105, + "num_tokens": 6082746526.0, + "step": 11899 + }, + { + "epoch": 3.2179556517036234, + "grad_norm": 3.15625, + "learning_rate": 0.007360745430762594, + "loss": 2.9863, + "mean_token_accuracy": 0.41491615772247314, + "num_tokens": 6083270802.0, + "step": 11900 + }, + { + "epoch": 3.218226068144943, + "grad_norm": 3.03125, + "learning_rate": 0.007359303630849537, + "loss": 3.0868, + "mean_token_accuracy": 0.405288428068161, + "num_tokens": 6083795081.0, + "step": 11901 + }, + { + "epoch": 3.2184964845862627, + "grad_norm": 2.9375, + "learning_rate": 0.007357861942643745, + "loss": 3.1626, + "mean_token_accuracy": 0.4076642394065857, + "num_tokens": 6084309466.0, + "step": 11902 + }, + { + "epoch": 3.2187669010275823, + "grad_norm": 3.125, + "learning_rate": 0.007356420366189465, + "loss": 2.9271, + "mean_token_accuracy": 0.4046579599380493, + "num_tokens": 6084833512.0, + "step": 11903 + }, + { + "epoch": 3.219037317468902, + "grad_norm": 2.703125, + "learning_rate": 0.007354978901530922, + "loss": 2.7748, + "mean_token_accuracy": 0.44220292568206787, + "num_tokens": 6085299006.0, + "step": 11904 + }, + { + "epoch": 3.2193077339102216, + "grad_norm": 3.0, + "learning_rate": 0.007353537548712343, + "loss": 2.8465, + "mean_token_accuracy": 0.4218916893005371, + "num_tokens": 6085823157.0, + "step": 11905 + }, + { + "epoch": 3.2195781503515413, + "grad_norm": 2.9375, + "learning_rate": 0.007352096307777958, + "loss": 3.0254, + "mean_token_accuracy": 0.4312433898448944, + "num_tokens": 6086302899.0, + "step": 11906 + }, + { + "epoch": 3.219848566792861, + "grad_norm": 2.6875, + "learning_rate": 0.007350655178771985, + "loss": 2.8744, + "mean_token_accuracy": 0.4165741801261902, + "num_tokens": 6086826950.0, + "step": 11907 + }, + { + "epoch": 3.2201189832341806, + "grad_norm": 3.015625, + "learning_rate": 0.007349214161738642, + "loss": 2.8437, + "mean_token_accuracy": 0.4250231981277466, + "num_tokens": 6087351137.0, + "step": 11908 + }, + { + "epoch": 3.2203893996755, + "grad_norm": 2.796875, + "learning_rate": 0.007347773256722148, + "loss": 2.996, + "mean_token_accuracy": 0.4208452105522156, + "num_tokens": 6087875328.0, + "step": 11909 + }, + { + "epoch": 3.22065981611682, + "grad_norm": 2.46875, + "learning_rate": 0.007346332463766711, + "loss": 2.8666, + "mean_token_accuracy": 0.4352805018424988, + "num_tokens": 6088342415.0, + "step": 11910 + }, + { + "epoch": 3.2209302325581395, + "grad_norm": 51.5, + "learning_rate": 0.007344891782916536, + "loss": 11.979, + "mean_token_accuracy": 0.012697870843112469, + "num_tokens": 6088866667.0, + "step": 11911 + }, + { + "epoch": 3.221200648999459, + "grad_norm": 6.8125, + "learning_rate": 0.0073434512142158325, + "loss": 3.1354, + "mean_token_accuracy": 0.4026794135570526, + "num_tokens": 6089390831.0, + "step": 11912 + }, + { + "epoch": 3.2214710654407788, + "grad_norm": 2.171875, + "learning_rate": 0.007342010757708797, + "loss": 2.7959, + "mean_token_accuracy": 0.4197787940502167, + "num_tokens": 6089915107.0, + "step": 11913 + }, + { + "epoch": 3.2217414818820984, + "grad_norm": 2.71875, + "learning_rate": 0.0073405704134396265, + "loss": 2.7768, + "mean_token_accuracy": 0.4492981433868408, + "num_tokens": 6090439367.0, + "step": 11914 + }, + { + "epoch": 3.222011898323418, + "grad_norm": 2.71875, + "learning_rate": 0.007339130181452522, + "loss": 2.836, + "mean_token_accuracy": 0.4095616936683655, + "num_tokens": 6090927642.0, + "step": 11915 + }, + { + "epoch": 3.2222823147647377, + "grad_norm": 3.359375, + "learning_rate": 0.007337690061791666, + "loss": 2.804, + "mean_token_accuracy": 0.4390990436077118, + "num_tokens": 6091451916.0, + "step": 11916 + }, + { + "epoch": 3.2225527312060573, + "grad_norm": 3.140625, + "learning_rate": 0.007336250054501252, + "loss": 2.8509, + "mean_token_accuracy": 0.42371901869773865, + "num_tokens": 6091976037.0, + "step": 11917 + }, + { + "epoch": 3.222823147647377, + "grad_norm": 2.921875, + "learning_rate": 0.007334810159625463, + "loss": 2.9843, + "mean_token_accuracy": 0.397965669631958, + "num_tokens": 6092500304.0, + "step": 11918 + }, + { + "epoch": 3.2230935640886966, + "grad_norm": 2.828125, + "learning_rate": 0.007333370377208471, + "loss": 2.8972, + "mean_token_accuracy": 0.4270346164703369, + "num_tokens": 6093024547.0, + "step": 11919 + }, + { + "epoch": 3.2233639805300163, + "grad_norm": 3.40625, + "learning_rate": 0.007331930707294463, + "loss": 2.9286, + "mean_token_accuracy": 0.42821627855300903, + "num_tokens": 6093485613.0, + "step": 11920 + }, + { + "epoch": 3.223634396971336, + "grad_norm": 2.8125, + "learning_rate": 0.007330491149927605, + "loss": 2.9097, + "mean_token_accuracy": 0.4276629090309143, + "num_tokens": 6094009833.0, + "step": 11921 + }, + { + "epoch": 3.2239048134126556, + "grad_norm": 2.828125, + "learning_rate": 0.00732905170515207, + "loss": 2.8785, + "mean_token_accuracy": 0.4305011034011841, + "num_tokens": 6094534025.0, + "step": 11922 + }, + { + "epoch": 3.224175229853975, + "grad_norm": 2.9375, + "learning_rate": 0.007327612373012025, + "loss": 2.881, + "mean_token_accuracy": 0.4246487021446228, + "num_tokens": 6095056312.0, + "step": 11923 + }, + { + "epoch": 3.224445646295295, + "grad_norm": 2.390625, + "learning_rate": 0.007326173153551634, + "loss": 2.9539, + "mean_token_accuracy": 0.46925467252731323, + "num_tokens": 6095485282.0, + "step": 11924 + }, + { + "epoch": 3.2247160627366145, + "grad_norm": 2.75, + "learning_rate": 0.007324734046815051, + "loss": 2.7673, + "mean_token_accuracy": 0.4375058710575104, + "num_tokens": 6096009449.0, + "step": 11925 + }, + { + "epoch": 3.224986479177934, + "grad_norm": 3.5, + "learning_rate": 0.0073232950528464395, + "loss": 3.0572, + "mean_token_accuracy": 0.4313199520111084, + "num_tokens": 6096499870.0, + "step": 11926 + }, + { + "epoch": 3.2252568956192538, + "grad_norm": 3.140625, + "learning_rate": 0.007321856171689947, + "loss": 2.9286, + "mean_token_accuracy": 0.4034534692764282, + "num_tokens": 6097024134.0, + "step": 11927 + }, + { + "epoch": 3.2255273120605734, + "grad_norm": 3.03125, + "learning_rate": 0.007320417403389721, + "loss": 2.9981, + "mean_token_accuracy": 0.4244178831577301, + "num_tokens": 6097546358.0, + "step": 11928 + }, + { + "epoch": 3.225797728501893, + "grad_norm": 3.140625, + "learning_rate": 0.007318978747989915, + "loss": 2.8875, + "mean_token_accuracy": 0.4342316687107086, + "num_tokens": 6097966120.0, + "step": 11929 + }, + { + "epoch": 3.2260681449432127, + "grad_norm": 2.75, + "learning_rate": 0.007317540205534663, + "loss": 2.9175, + "mean_token_accuracy": 0.4567689895629883, + "num_tokens": 6098435381.0, + "step": 11930 + }, + { + "epoch": 3.2263385613845323, + "grad_norm": 125.0, + "learning_rate": 0.007316101776068107, + "loss": 11.1417, + "mean_token_accuracy": 0.019585270434617996, + "num_tokens": 6098959443.0, + "step": 11931 + }, + { + "epoch": 3.226608977825852, + "grad_norm": 6.84375, + "learning_rate": 0.007314663459634385, + "loss": 3.1718, + "mean_token_accuracy": 0.3906562924385071, + "num_tokens": 6099483721.0, + "step": 11932 + }, + { + "epoch": 3.2268793942671716, + "grad_norm": 2.234375, + "learning_rate": 0.007313225256277623, + "loss": 2.9483, + "mean_token_accuracy": 0.41157883405685425, + "num_tokens": 6100007802.0, + "step": 11933 + }, + { + "epoch": 3.2271498107084913, + "grad_norm": 2.734375, + "learning_rate": 0.007311787166041952, + "loss": 2.9539, + "mean_token_accuracy": 0.4363974928855896, + "num_tokens": 6100471198.0, + "step": 11934 + }, + { + "epoch": 3.227420227149811, + "grad_norm": 3.34375, + "learning_rate": 0.007310349188971499, + "loss": 3.0693, + "mean_token_accuracy": 0.393608033657074, + "num_tokens": 6100995293.0, + "step": 11935 + }, + { + "epoch": 3.2276906435911306, + "grad_norm": 3.59375, + "learning_rate": 0.0073089113251103835, + "loss": 3.0401, + "mean_token_accuracy": 0.41471678018569946, + "num_tokens": 6101500212.0, + "step": 11936 + }, + { + "epoch": 3.2279610600324498, + "grad_norm": 3.0625, + "learning_rate": 0.007307473574502719, + "loss": 2.818, + "mean_token_accuracy": 0.4342876374721527, + "num_tokens": 6101971839.0, + "step": 11937 + }, + { + "epoch": 3.2282314764737694, + "grad_norm": 3.5, + "learning_rate": 0.00730603593719263, + "loss": 3.2237, + "mean_token_accuracy": 0.40179207921028137, + "num_tokens": 6102435636.0, + "step": 11938 + }, + { + "epoch": 3.228501892915089, + "grad_norm": 2.71875, + "learning_rate": 0.007304598413224217, + "loss": 2.8391, + "mean_token_accuracy": 0.4020761251449585, + "num_tokens": 6102959882.0, + "step": 11939 + }, + { + "epoch": 3.2287723093564087, + "grad_norm": 2.875, + "learning_rate": 0.007303161002641596, + "loss": 2.8541, + "mean_token_accuracy": 0.4233241081237793, + "num_tokens": 6103442450.0, + "step": 11940 + }, + { + "epoch": 3.2290427257977283, + "grad_norm": 2.609375, + "learning_rate": 0.007301723705488865, + "loss": 2.8705, + "mean_token_accuracy": 0.43427473306655884, + "num_tokens": 6103966682.0, + "step": 11941 + }, + { + "epoch": 3.229313142239048, + "grad_norm": 2.78125, + "learning_rate": 0.007300286521810127, + "loss": 2.9863, + "mean_token_accuracy": 0.4320712685585022, + "num_tokens": 6104418814.0, + "step": 11942 + }, + { + "epoch": 3.2295835586803676, + "grad_norm": 2.515625, + "learning_rate": 0.007298849451649479, + "loss": 2.8995, + "mean_token_accuracy": 0.41955476999282837, + "num_tokens": 6104942973.0, + "step": 11943 + }, + { + "epoch": 3.2298539751216873, + "grad_norm": 2.515625, + "learning_rate": 0.007297412495051015, + "loss": 3.1994, + "mean_token_accuracy": 0.3955283761024475, + "num_tokens": 6105467226.0, + "step": 11944 + }, + { + "epoch": 3.230124391563007, + "grad_norm": 3.59375, + "learning_rate": 0.00729597565205882, + "loss": 2.955, + "mean_token_accuracy": 0.42516663670539856, + "num_tokens": 6105991445.0, + "step": 11945 + }, + { + "epoch": 3.2303948080043265, + "grad_norm": 3.078125, + "learning_rate": 0.007294538922716989, + "loss": 2.847, + "mean_token_accuracy": 0.4199737310409546, + "num_tokens": 6106515566.0, + "step": 11946 + }, + { + "epoch": 3.230665224445646, + "grad_norm": 3.125, + "learning_rate": 0.007293102307069602, + "loss": 3.1062, + "mean_token_accuracy": 0.41412052512168884, + "num_tokens": 6107039741.0, + "step": 11947 + }, + { + "epoch": 3.230935640886966, + "grad_norm": 3.15625, + "learning_rate": 0.0072916658051607325, + "loss": 2.9899, + "mean_token_accuracy": 0.4241092801094055, + "num_tokens": 6107544112.0, + "step": 11948 + }, + { + "epoch": 3.2312060573282855, + "grad_norm": 3.46875, + "learning_rate": 0.0072902294170344665, + "loss": 2.8706, + "mean_token_accuracy": 0.43362218141555786, + "num_tokens": 6108068282.0, + "step": 11949 + }, + { + "epoch": 3.231476473769605, + "grad_norm": 2.953125, + "learning_rate": 0.007288793142734871, + "loss": 2.8906, + "mean_token_accuracy": 0.4440164566040039, + "num_tokens": 6108557302.0, + "step": 11950 + }, + { + "epoch": 3.2317468902109248, + "grad_norm": 46.5, + "learning_rate": 0.007287356982306013, + "loss": 10.3154, + "mean_token_accuracy": 0.005597040057182312, + "num_tokens": 6109044873.0, + "step": 11951 + }, + { + "epoch": 3.2320173066522444, + "grad_norm": 7.46875, + "learning_rate": 0.007285920935791963, + "loss": 3.2692, + "mean_token_accuracy": 0.38191166520118713, + "num_tokens": 6109569040.0, + "step": 11952 + }, + { + "epoch": 3.232287723093564, + "grad_norm": 2.0, + "learning_rate": 0.007284485003236783, + "loss": 3.1141, + "mean_token_accuracy": 0.39576566219329834, + "num_tokens": 6110093267.0, + "step": 11953 + }, + { + "epoch": 3.2325581395348837, + "grad_norm": 2.9375, + "learning_rate": 0.007283049184684523, + "loss": 3.1783, + "mean_token_accuracy": 0.40026649832725525, + "num_tokens": 6110591426.0, + "step": 11954 + }, + { + "epoch": 3.2328285559762033, + "grad_norm": 3.421875, + "learning_rate": 0.007281613480179252, + "loss": 2.9598, + "mean_token_accuracy": 0.40765106678009033, + "num_tokens": 6111115676.0, + "step": 11955 + }, + { + "epoch": 3.233098972417523, + "grad_norm": 2.828125, + "learning_rate": 0.007280177889765011, + "loss": 3.1444, + "mean_token_accuracy": 0.4030550718307495, + "num_tokens": 6111639947.0, + "step": 11956 + }, + { + "epoch": 3.2333693888588426, + "grad_norm": 3.140625, + "learning_rate": 0.00727874241348585, + "loss": 2.8263, + "mean_token_accuracy": 0.434861958026886, + "num_tokens": 6112164226.0, + "step": 11957 + }, + { + "epoch": 3.2336398053001623, + "grad_norm": 2.296875, + "learning_rate": 0.007277307051385819, + "loss": 2.9644, + "mean_token_accuracy": 0.4156053066253662, + "num_tokens": 6112688424.0, + "step": 11958 + }, + { + "epoch": 3.233910221741482, + "grad_norm": 2.859375, + "learning_rate": 0.007275871803508955, + "loss": 2.811, + "mean_token_accuracy": 0.4048348665237427, + "num_tokens": 6113212701.0, + "step": 11959 + }, + { + "epoch": 3.2341806381828015, + "grad_norm": 2.25, + "learning_rate": 0.007274436669899294, + "loss": 3.1602, + "mean_token_accuracy": 0.39330625534057617, + "num_tokens": 6113693938.0, + "step": 11960 + }, + { + "epoch": 3.234451054624121, + "grad_norm": 4.65625, + "learning_rate": 0.007273001650600875, + "loss": 2.5686, + "mean_token_accuracy": 0.4698520004749298, + "num_tokens": 6114218148.0, + "step": 11961 + }, + { + "epoch": 3.234721471065441, + "grad_norm": 2.796875, + "learning_rate": 0.007271566745657721, + "loss": 3.157, + "mean_token_accuracy": 0.39722058176994324, + "num_tokens": 6114742417.0, + "step": 11962 + }, + { + "epoch": 3.2349918875067605, + "grad_norm": 2.75, + "learning_rate": 0.007270131955113866, + "loss": 2.9958, + "mean_token_accuracy": 0.40613049268722534, + "num_tokens": 6115266589.0, + "step": 11963 + }, + { + "epoch": 3.23526230394808, + "grad_norm": 3.40625, + "learning_rate": 0.007268697279013334, + "loss": 3.1578, + "mean_token_accuracy": 0.3790108561515808, + "num_tokens": 6115790847.0, + "step": 11964 + }, + { + "epoch": 3.2355327203893998, + "grad_norm": 3.375, + "learning_rate": 0.0072672627174001385, + "loss": 3.1796, + "mean_token_accuracy": 0.3991580605506897, + "num_tokens": 6116315117.0, + "step": 11965 + }, + { + "epoch": 3.2358031368307194, + "grad_norm": 2.96875, + "learning_rate": 0.007265828270318304, + "loss": 2.9858, + "mean_token_accuracy": 0.41027942299842834, + "num_tokens": 6116839323.0, + "step": 11966 + }, + { + "epoch": 3.236073553272039, + "grad_norm": 2.90625, + "learning_rate": 0.0072643939378118395, + "loss": 3.1617, + "mean_token_accuracy": 0.3939846158027649, + "num_tokens": 6117363527.0, + "step": 11967 + }, + { + "epoch": 3.2363439697133587, + "grad_norm": 3.0, + "learning_rate": 0.007262959719924751, + "loss": 2.8064, + "mean_token_accuracy": 0.4213414192199707, + "num_tokens": 6117887793.0, + "step": 11968 + }, + { + "epoch": 3.2366143861546783, + "grad_norm": 3.71875, + "learning_rate": 0.007261525616701053, + "loss": 2.8008, + "mean_token_accuracy": 0.417785108089447, + "num_tokens": 6118412022.0, + "step": 11969 + }, + { + "epoch": 3.236884802595998, + "grad_norm": 2.765625, + "learning_rate": 0.007260091628184739, + "loss": 2.7716, + "mean_token_accuracy": 0.43337592482566833, + "num_tokens": 6118936300.0, + "step": 11970 + }, + { + "epoch": 3.2371552190373176, + "grad_norm": 18.75, + "learning_rate": 0.00725865775441981, + "loss": 10.6908, + "mean_token_accuracy": 0.01585981249809265, + "num_tokens": 6119460447.0, + "step": 11971 + }, + { + "epoch": 3.2374256354786373, + "grad_norm": 8.375, + "learning_rate": 0.007257223995450271, + "loss": 2.9024, + "mean_token_accuracy": 0.42207232117652893, + "num_tokens": 6119984719.0, + "step": 11972 + }, + { + "epoch": 3.237696051919957, + "grad_norm": 2.796875, + "learning_rate": 0.007255790351320104, + "loss": 2.9724, + "mean_token_accuracy": 0.42809224128723145, + "num_tokens": 6120508898.0, + "step": 11973 + }, + { + "epoch": 3.2379664683612766, + "grad_norm": 2.765625, + "learning_rate": 0.007254356822073296, + "loss": 2.8822, + "mean_token_accuracy": 0.41112154722213745, + "num_tokens": 6120987800.0, + "step": 11974 + }, + { + "epoch": 3.238236884802596, + "grad_norm": 5.25, + "learning_rate": 0.007252923407753842, + "loss": 2.8729, + "mean_token_accuracy": 0.4567728340625763, + "num_tokens": 6121511925.0, + "step": 11975 + }, + { + "epoch": 3.238507301243916, + "grad_norm": 1.9609375, + "learning_rate": 0.007251490108405714, + "loss": 2.904, + "mean_token_accuracy": 0.4142213463783264, + "num_tokens": 6122036190.0, + "step": 11976 + }, + { + "epoch": 3.2387777176852355, + "grad_norm": 2.578125, + "learning_rate": 0.007250056924072892, + "loss": 3.0202, + "mean_token_accuracy": 0.4232708215713501, + "num_tokens": 6122500494.0, + "step": 11977 + }, + { + "epoch": 3.2390481341265547, + "grad_norm": 2.484375, + "learning_rate": 0.007248623854799355, + "loss": 2.7893, + "mean_token_accuracy": 0.4219469726085663, + "num_tokens": 6123024774.0, + "step": 11978 + }, + { + "epoch": 3.2393185505678743, + "grad_norm": 2.75, + "learning_rate": 0.007247190900629066, + "loss": 2.8662, + "mean_token_accuracy": 0.40222594141960144, + "num_tokens": 6123549041.0, + "step": 11979 + }, + { + "epoch": 3.239588967009194, + "grad_norm": 2.890625, + "learning_rate": 0.007245758061606, + "loss": 3.0843, + "mean_token_accuracy": 0.41487744450569153, + "num_tokens": 6124073310.0, + "step": 11980 + }, + { + "epoch": 3.2398593834505136, + "grad_norm": 2.890625, + "learning_rate": 0.007244325337774114, + "loss": 2.9992, + "mean_token_accuracy": 0.41793763637542725, + "num_tokens": 6124597414.0, + "step": 11981 + }, + { + "epoch": 3.2401297998918333, + "grad_norm": 7.59375, + "learning_rate": 0.007242892729177373, + "loss": 2.8263, + "mean_token_accuracy": 0.44859299063682556, + "num_tokens": 6125111510.0, + "step": 11982 + }, + { + "epoch": 3.240400216333153, + "grad_norm": 2.1875, + "learning_rate": 0.007241460235859732, + "loss": 3.1142, + "mean_token_accuracy": 0.40285998582839966, + "num_tokens": 6125635665.0, + "step": 11983 + }, + { + "epoch": 3.2406706327744725, + "grad_norm": 2.625, + "learning_rate": 0.007240027857865143, + "loss": 2.7127, + "mean_token_accuracy": 0.430854856967926, + "num_tokens": 6126159816.0, + "step": 11984 + }, + { + "epoch": 3.240941049215792, + "grad_norm": 2.578125, + "learning_rate": 0.007238595595237556, + "loss": 2.9146, + "mean_token_accuracy": 0.42146962881088257, + "num_tokens": 6126684073.0, + "step": 11985 + }, + { + "epoch": 3.241211465657112, + "grad_norm": 3.109375, + "learning_rate": 0.007237163448020923, + "loss": 2.9397, + "mean_token_accuracy": 0.40960121154785156, + "num_tokens": 6127208354.0, + "step": 11986 + }, + { + "epoch": 3.2414818820984315, + "grad_norm": 2.609375, + "learning_rate": 0.0072357314162591795, + "loss": 3.0998, + "mean_token_accuracy": 0.4085181951522827, + "num_tokens": 6127732638.0, + "step": 11987 + }, + { + "epoch": 3.241752298539751, + "grad_norm": 4.8125, + "learning_rate": 0.007234299499996259, + "loss": 3.1965, + "mean_token_accuracy": 0.39065712690353394, + "num_tokens": 6128256839.0, + "step": 11988 + }, + { + "epoch": 3.2420227149810708, + "grad_norm": 2.875, + "learning_rate": 0.0072328676992761114, + "loss": 2.9275, + "mean_token_accuracy": 0.42884474992752075, + "num_tokens": 6128722701.0, + "step": 11989 + }, + { + "epoch": 3.2422931314223904, + "grad_norm": 2.9375, + "learning_rate": 0.007231436014142663, + "loss": 3.0629, + "mean_token_accuracy": 0.40519267320632935, + "num_tokens": 6129246923.0, + "step": 11990 + }, + { + "epoch": 3.24256354786371, + "grad_norm": 72.5, + "learning_rate": 0.007230004444639833, + "loss": 17.0744, + "mean_token_accuracy": 0.003144362010061741, + "num_tokens": 6129771115.0, + "step": 11991 + }, + { + "epoch": 3.2428339643050297, + "grad_norm": 6.65625, + "learning_rate": 0.0072285729908115595, + "loss": 3.4544, + "mean_token_accuracy": 0.33748918771743774, + "num_tokens": 6130283219.0, + "step": 11992 + }, + { + "epoch": 3.2431043807463493, + "grad_norm": 2.578125, + "learning_rate": 0.007227141652701757, + "loss": 3.0572, + "mean_token_accuracy": 0.4166980981826782, + "num_tokens": 6130749050.0, + "step": 11993 + }, + { + "epoch": 3.243374797187669, + "grad_norm": 3.046875, + "learning_rate": 0.00722571043035434, + "loss": 3.0583, + "mean_token_accuracy": 0.3804071545600891, + "num_tokens": 6131273118.0, + "step": 11994 + }, + { + "epoch": 3.2436452136289886, + "grad_norm": 3.625, + "learning_rate": 0.0072242793238132255, + "loss": 3.196, + "mean_token_accuracy": 0.40273505449295044, + "num_tokens": 6131797153.0, + "step": 11995 + }, + { + "epoch": 3.2439156300703083, + "grad_norm": 3.28125, + "learning_rate": 0.00722284833312233, + "loss": 2.8064, + "mean_token_accuracy": 0.42253708839416504, + "num_tokens": 6132321275.0, + "step": 11996 + }, + { + "epoch": 3.244186046511628, + "grad_norm": 2.546875, + "learning_rate": 0.00722141745832555, + "loss": 3.139, + "mean_token_accuracy": 0.40452274680137634, + "num_tokens": 6132845489.0, + "step": 11997 + }, + { + "epoch": 3.2444564629529475, + "grad_norm": 3.0625, + "learning_rate": 0.007219986699466801, + "loss": 2.9451, + "mean_token_accuracy": 0.4063345193862915, + "num_tokens": 6133369707.0, + "step": 11998 + }, + { + "epoch": 3.244726879394267, + "grad_norm": 2.734375, + "learning_rate": 0.007218556056589973, + "loss": 2.9016, + "mean_token_accuracy": 0.4254799485206604, + "num_tokens": 6133893871.0, + "step": 11999 + }, + { + "epoch": 3.244997295835587, + "grad_norm": 3.0, + "learning_rate": 0.007217125529738966, + "loss": 2.7927, + "mean_token_accuracy": 0.43472594022750854, + "num_tokens": 6134418122.0, + "step": 12000 + }, + { + "epoch": 3.2452677122769065, + "grad_norm": 2.96875, + "learning_rate": 0.007215695118957672, + "loss": 2.5538, + "mean_token_accuracy": 0.47269347310066223, + "num_tokens": 6134942395.0, + "step": 12001 + }, + { + "epoch": 3.245538128718226, + "grad_norm": 2.28125, + "learning_rate": 0.007214264824289982, + "loss": 2.9595, + "mean_token_accuracy": 0.41280779242515564, + "num_tokens": 6135466619.0, + "step": 12002 + }, + { + "epoch": 3.2458085451595458, + "grad_norm": 2.953125, + "learning_rate": 0.00721283464577978, + "loss": 3.0259, + "mean_token_accuracy": 0.42754751443862915, + "num_tokens": 6135989897.0, + "step": 12003 + }, + { + "epoch": 3.2460789616008654, + "grad_norm": 2.71875, + "learning_rate": 0.007211404583470949, + "loss": 3.1409, + "mean_token_accuracy": 0.3990319073200226, + "num_tokens": 6136514181.0, + "step": 12004 + }, + { + "epoch": 3.246349378042185, + "grad_norm": 2.734375, + "learning_rate": 0.007209974637407366, + "loss": 2.9828, + "mean_token_accuracy": 0.42905646562576294, + "num_tokens": 6136928399.0, + "step": 12005 + }, + { + "epoch": 3.2466197944835047, + "grad_norm": 2.890625, + "learning_rate": 0.007208544807632904, + "loss": 3.0374, + "mean_token_accuracy": 0.41750073432922363, + "num_tokens": 6137415181.0, + "step": 12006 + }, + { + "epoch": 3.2468902109248243, + "grad_norm": 2.53125, + "learning_rate": 0.007207115094191441, + "loss": 2.8256, + "mean_token_accuracy": 0.4260755479335785, + "num_tokens": 6137939368.0, + "step": 12007 + }, + { + "epoch": 3.247160627366144, + "grad_norm": 3.15625, + "learning_rate": 0.0072056854971268396, + "loss": 3.0321, + "mean_token_accuracy": 0.4202714264392853, + "num_tokens": 6138463639.0, + "step": 12008 + }, + { + "epoch": 3.2474310438074636, + "grad_norm": 4.5, + "learning_rate": 0.007204256016482966, + "loss": 3.2341, + "mean_token_accuracy": 0.3853684663772583, + "num_tokens": 6138987834.0, + "step": 12009 + }, + { + "epoch": 3.2477014602487833, + "grad_norm": 3.5, + "learning_rate": 0.007202826652303678, + "loss": 2.9695, + "mean_token_accuracy": 0.42604899406433105, + "num_tokens": 6139501305.0, + "step": 12010 + }, + { + "epoch": 3.247971876690103, + "grad_norm": 176.0, + "learning_rate": 0.007201397404632837, + "loss": 12.7288, + "mean_token_accuracy": 0.0025486520025879145, + "num_tokens": 6140015059.0, + "step": 12011 + }, + { + "epoch": 3.2482422931314225, + "grad_norm": 6.15625, + "learning_rate": 0.007199968273514295, + "loss": 3.1438, + "mean_token_accuracy": 0.36171242594718933, + "num_tokens": 6140539178.0, + "step": 12012 + }, + { + "epoch": 3.248512709572742, + "grad_norm": 2.421875, + "learning_rate": 0.007198539258991903, + "loss": 2.9571, + "mean_token_accuracy": 0.4358023703098297, + "num_tokens": 6141061781.0, + "step": 12013 + }, + { + "epoch": 3.248783126014062, + "grad_norm": 3.484375, + "learning_rate": 0.007197110361109502, + "loss": 3.2089, + "mean_token_accuracy": 0.4127930700778961, + "num_tokens": 6141540905.0, + "step": 12014 + }, + { + "epoch": 3.2490535424553815, + "grad_norm": 2.84375, + "learning_rate": 0.007195681579910942, + "loss": 2.9953, + "mean_token_accuracy": 0.40178382396698, + "num_tokens": 6142065079.0, + "step": 12015 + }, + { + "epoch": 3.249323958896701, + "grad_norm": 3.15625, + "learning_rate": 0.0071942529154400605, + "loss": 2.8424, + "mean_token_accuracy": 0.4366675317287445, + "num_tokens": 6142589230.0, + "step": 12016 + }, + { + "epoch": 3.2495943753380208, + "grad_norm": 3.109375, + "learning_rate": 0.007192824367740686, + "loss": 3.1038, + "mean_token_accuracy": 0.4165576994419098, + "num_tokens": 6143113444.0, + "step": 12017 + }, + { + "epoch": 3.2498647917793404, + "grad_norm": 4.1875, + "learning_rate": 0.00719139593685666, + "loss": 3.0789, + "mean_token_accuracy": 0.40745091438293457, + "num_tokens": 6143637708.0, + "step": 12018 + }, + { + "epoch": 3.2501352082206596, + "grad_norm": 2.375, + "learning_rate": 0.007189967622831808, + "loss": 2.7738, + "mean_token_accuracy": 0.42664074897766113, + "num_tokens": 6144161943.0, + "step": 12019 + }, + { + "epoch": 3.2504056246619797, + "grad_norm": 3.609375, + "learning_rate": 0.00718853942570995, + "loss": 3.0605, + "mean_token_accuracy": 0.4059738516807556, + "num_tokens": 6144686218.0, + "step": 12020 + }, + { + "epoch": 3.250676041103299, + "grad_norm": 2.9375, + "learning_rate": 0.007187111345534916, + "loss": 3.0495, + "mean_token_accuracy": 0.3962317705154419, + "num_tokens": 6145210408.0, + "step": 12021 + }, + { + "epoch": 3.2509464575446185, + "grad_norm": 3.09375, + "learning_rate": 0.0071856833823505115, + "loss": 2.9357, + "mean_token_accuracy": 0.4066314101219177, + "num_tokens": 6145734604.0, + "step": 12022 + }, + { + "epoch": 3.251216873985938, + "grad_norm": 2.578125, + "learning_rate": 0.007184255536200565, + "loss": 2.8534, + "mean_token_accuracy": 0.44512006640434265, + "num_tokens": 6146196501.0, + "step": 12023 + }, + { + "epoch": 3.251487290427258, + "grad_norm": 2.390625, + "learning_rate": 0.007182827807128875, + "loss": 2.9411, + "mean_token_accuracy": 0.4304594099521637, + "num_tokens": 6146706271.0, + "step": 12024 + }, + { + "epoch": 3.2517577068685775, + "grad_norm": 3.0, + "learning_rate": 0.007181400195179257, + "loss": 3.0124, + "mean_token_accuracy": 0.4107150137424469, + "num_tokens": 6147230550.0, + "step": 12025 + }, + { + "epoch": 3.252028123309897, + "grad_norm": 2.359375, + "learning_rate": 0.007179972700395507, + "loss": 2.7785, + "mean_token_accuracy": 0.448448121547699, + "num_tokens": 6147704131.0, + "step": 12026 + }, + { + "epoch": 3.2522985397512167, + "grad_norm": 3.078125, + "learning_rate": 0.007178545322821435, + "loss": 3.1698, + "mean_token_accuracy": 0.40819305181503296, + "num_tokens": 6148215611.0, + "step": 12027 + }, + { + "epoch": 3.2525689561925364, + "grad_norm": 3.296875, + "learning_rate": 0.007177118062500827, + "loss": 3.0132, + "mean_token_accuracy": 0.4156627953052521, + "num_tokens": 6148739894.0, + "step": 12028 + }, + { + "epoch": 3.252839372633856, + "grad_norm": 2.921875, + "learning_rate": 0.007175690919477478, + "loss": 3.0485, + "mean_token_accuracy": 0.41760995984077454, + "num_tokens": 6149264094.0, + "step": 12029 + }, + { + "epoch": 3.2531097890751757, + "grad_norm": 2.671875, + "learning_rate": 0.007174263893795179, + "loss": 2.5832, + "mean_token_accuracy": 0.44342389702796936, + "num_tokens": 6149776117.0, + "step": 12030 + }, + { + "epoch": 3.2533802055164953, + "grad_norm": 12.25, + "learning_rate": 0.007172836985497717, + "loss": 10.355, + "mean_token_accuracy": 0.0002495326625648886, + "num_tokens": 6150239596.0, + "step": 12031 + }, + { + "epoch": 3.253650621957815, + "grad_norm": 6.71875, + "learning_rate": 0.007171410194628866, + "loss": 3.1082, + "mean_token_accuracy": 0.4153279662132263, + "num_tokens": 6150651195.0, + "step": 12032 + }, + { + "epoch": 3.2539210383991346, + "grad_norm": 2.265625, + "learning_rate": 0.007169983521232415, + "loss": 3.0901, + "mean_token_accuracy": 0.4077797532081604, + "num_tokens": 6151175440.0, + "step": 12033 + }, + { + "epoch": 3.2541914548404542, + "grad_norm": 2.265625, + "learning_rate": 0.0071685569653521285, + "loss": 2.9877, + "mean_token_accuracy": 0.40486085414886475, + "num_tokens": 6151699554.0, + "step": 12034 + }, + { + "epoch": 3.254461871281774, + "grad_norm": 17.375, + "learning_rate": 0.007167130527031784, + "loss": 2.7986, + "mean_token_accuracy": 0.4576781988143921, + "num_tokens": 6152212735.0, + "step": 12035 + }, + { + "epoch": 3.2547322877230935, + "grad_norm": 4.375, + "learning_rate": 0.007165704206315149, + "loss": 2.9339, + "mean_token_accuracy": 0.42489075660705566, + "num_tokens": 6152682907.0, + "step": 12036 + }, + { + "epoch": 3.255002704164413, + "grad_norm": 2.765625, + "learning_rate": 0.007164278003245979, + "loss": 3.0379, + "mean_token_accuracy": 0.40636685490608215, + "num_tokens": 6153183471.0, + "step": 12037 + }, + { + "epoch": 3.255273120605733, + "grad_norm": 2.859375, + "learning_rate": 0.007162851917868043, + "loss": 3.1037, + "mean_token_accuracy": 0.3824746608734131, + "num_tokens": 6153666535.0, + "step": 12038 + }, + { + "epoch": 3.2555435370470525, + "grad_norm": 3.734375, + "learning_rate": 0.0071614259502250956, + "loss": 3.0111, + "mean_token_accuracy": 0.4200531244277954, + "num_tokens": 6154164272.0, + "step": 12039 + }, + { + "epoch": 3.255813953488372, + "grad_norm": 3.078125, + "learning_rate": 0.007160000100360886, + "loss": 2.7037, + "mean_token_accuracy": 0.4385424554347992, + "num_tokens": 6154642279.0, + "step": 12040 + }, + { + "epoch": 3.2560843699296917, + "grad_norm": 3.21875, + "learning_rate": 0.007158574368319169, + "loss": 2.6986, + "mean_token_accuracy": 0.45584097504615784, + "num_tokens": 6155136558.0, + "step": 12041 + }, + { + "epoch": 3.2563547863710114, + "grad_norm": 3.875, + "learning_rate": 0.007157148754143687, + "loss": 2.9078, + "mean_token_accuracy": 0.4310910701751709, + "num_tokens": 6155660733.0, + "step": 12042 + }, + { + "epoch": 3.256625202812331, + "grad_norm": 2.46875, + "learning_rate": 0.00715572325787818, + "loss": 2.8369, + "mean_token_accuracy": 0.411385178565979, + "num_tokens": 6156163091.0, + "step": 12043 + }, + { + "epoch": 3.2568956192536507, + "grad_norm": 2.625, + "learning_rate": 0.007154297879566389, + "loss": 2.9999, + "mean_token_accuracy": 0.4021402597427368, + "num_tokens": 6156687360.0, + "step": 12044 + }, + { + "epoch": 3.2571660356949703, + "grad_norm": 2.734375, + "learning_rate": 0.0071528726192520535, + "loss": 3.1679, + "mean_token_accuracy": 0.41008129715919495, + "num_tokens": 6157211606.0, + "step": 12045 + }, + { + "epoch": 3.25743645213629, + "grad_norm": 2.859375, + "learning_rate": 0.007151447476978892, + "loss": 2.944, + "mean_token_accuracy": 0.4159016013145447, + "num_tokens": 6157735870.0, + "step": 12046 + }, + { + "epoch": 3.2577068685776096, + "grad_norm": 2.609375, + "learning_rate": 0.007150022452790645, + "loss": 2.9471, + "mean_token_accuracy": 0.4086260497570038, + "num_tokens": 6158260048.0, + "step": 12047 + }, + { + "epoch": 3.2579772850189292, + "grad_norm": 2.40625, + "learning_rate": 0.007148597546731031, + "loss": 2.7498, + "mean_token_accuracy": 0.4401387870311737, + "num_tokens": 6158784253.0, + "step": 12048 + }, + { + "epoch": 3.258247701460249, + "grad_norm": 2.484375, + "learning_rate": 0.007147172758843768, + "loss": 2.9983, + "mean_token_accuracy": 0.39942896366119385, + "num_tokens": 6159308390.0, + "step": 12049 + }, + { + "epoch": 3.2585181179015685, + "grad_norm": 2.96875, + "learning_rate": 0.007145748089172579, + "loss": 2.9323, + "mean_token_accuracy": 0.43551188707351685, + "num_tokens": 6159795840.0, + "step": 12050 + }, + { + "epoch": 3.258788534342888, + "grad_norm": 16.25, + "learning_rate": 0.007144323537761166, + "loss": 12.4263, + "mean_token_accuracy": 0.012185413390398026, + "num_tokens": 6160287045.0, + "step": 12051 + }, + { + "epoch": 3.259058950784208, + "grad_norm": 5.90625, + "learning_rate": 0.007142899104653249, + "loss": 3.0059, + "mean_token_accuracy": 0.39815282821655273, + "num_tokens": 6160811252.0, + "step": 12052 + }, + { + "epoch": 3.2593293672255275, + "grad_norm": 1.9453125, + "learning_rate": 0.0071414747898925325, + "loss": 2.7769, + "mean_token_accuracy": 0.428896963596344, + "num_tokens": 6161335303.0, + "step": 12053 + }, + { + "epoch": 3.259599783666847, + "grad_norm": 3.25, + "learning_rate": 0.007140050593522718, + "loss": 2.7898, + "mean_token_accuracy": 0.4198627471923828, + "num_tokens": 6161859479.0, + "step": 12054 + }, + { + "epoch": 3.2598702001081667, + "grad_norm": 3.21875, + "learning_rate": 0.007138626515587498, + "loss": 3.0357, + "mean_token_accuracy": 0.4131057858467102, + "num_tokens": 6162383680.0, + "step": 12055 + }, + { + "epoch": 3.2601406165494864, + "grad_norm": 2.640625, + "learning_rate": 0.007137202556130575, + "loss": 2.9093, + "mean_token_accuracy": 0.41297584772109985, + "num_tokens": 6162895348.0, + "step": 12056 + }, + { + "epoch": 3.260411032990806, + "grad_norm": 9.5625, + "learning_rate": 0.007135778715195635, + "loss": 3.0484, + "mean_token_accuracy": 0.44059574604034424, + "num_tokens": 6163419542.0, + "step": 12057 + }, + { + "epoch": 3.2606814494321252, + "grad_norm": 2.921875, + "learning_rate": 0.007134354992826373, + "loss": 2.9594, + "mean_token_accuracy": 0.4124055504798889, + "num_tokens": 6163943816.0, + "step": 12058 + }, + { + "epoch": 3.2609518658734453, + "grad_norm": 2.59375, + "learning_rate": 0.007132931389066466, + "loss": 3.0156, + "mean_token_accuracy": 0.4259635806083679, + "num_tokens": 6164467935.0, + "step": 12059 + }, + { + "epoch": 3.2612222823147645, + "grad_norm": 3.0625, + "learning_rate": 0.007131507903959593, + "loss": 2.9275, + "mean_token_accuracy": 0.3945387005805969, + "num_tokens": 6164992187.0, + "step": 12060 + }, + { + "epoch": 3.2614926987560846, + "grad_norm": 2.375, + "learning_rate": 0.007130084537549437, + "loss": 2.9477, + "mean_token_accuracy": 0.4258772134780884, + "num_tokens": 6165465018.0, + "step": 12061 + }, + { + "epoch": 3.261763115197404, + "grad_norm": 2.765625, + "learning_rate": 0.00712866128987967, + "loss": 2.8993, + "mean_token_accuracy": 0.4295670986175537, + "num_tokens": 6165934797.0, + "step": 12062 + }, + { + "epoch": 3.2620335316387234, + "grad_norm": 2.4375, + "learning_rate": 0.007127238160993954, + "loss": 2.8444, + "mean_token_accuracy": 0.43173497915267944, + "num_tokens": 6166458863.0, + "step": 12063 + }, + { + "epoch": 3.262303948080043, + "grad_norm": 3.25, + "learning_rate": 0.007125815150935964, + "loss": 2.9866, + "mean_token_accuracy": 0.42392152547836304, + "num_tokens": 6166918566.0, + "step": 12064 + }, + { + "epoch": 3.2625743645213627, + "grad_norm": 3.03125, + "learning_rate": 0.007124392259749355, + "loss": 2.9746, + "mean_token_accuracy": 0.4189973771572113, + "num_tokens": 6167442640.0, + "step": 12065 + }, + { + "epoch": 3.2628447809626824, + "grad_norm": 3.3125, + "learning_rate": 0.007122969487477793, + "loss": 2.9998, + "mean_token_accuracy": 0.4104696214199066, + "num_tokens": 6167966897.0, + "step": 12066 + }, + { + "epoch": 3.263115197404002, + "grad_norm": 2.875, + "learning_rate": 0.007121546834164925, + "loss": 2.9071, + "mean_token_accuracy": 0.41231149435043335, + "num_tokens": 6168491157.0, + "step": 12067 + }, + { + "epoch": 3.2633856138453217, + "grad_norm": 3.46875, + "learning_rate": 0.007120124299854409, + "loss": 3.0072, + "mean_token_accuracy": 0.39558207988739014, + "num_tokens": 6169015356.0, + "step": 12068 + }, + { + "epoch": 3.2636560302866413, + "grad_norm": 2.8125, + "learning_rate": 0.007118701884589886, + "loss": 2.9414, + "mean_token_accuracy": 0.4159128665924072, + "num_tokens": 6169539364.0, + "step": 12069 + }, + { + "epoch": 3.263926446727961, + "grad_norm": 3.1875, + "learning_rate": 0.007117279588415007, + "loss": 3.1148, + "mean_token_accuracy": 0.3827345073223114, + "num_tokens": 6170063640.0, + "step": 12070 + }, + { + "epoch": 3.2641968631692806, + "grad_norm": 56.0, + "learning_rate": 0.00711585741137341, + "loss": 21.036, + "mean_token_accuracy": 0.0, + "num_tokens": 6170587920.0, + "step": 12071 + }, + { + "epoch": 3.2644672796106002, + "grad_norm": 6.5, + "learning_rate": 0.007114435353508724, + "loss": 3.1944, + "mean_token_accuracy": 0.3870287537574768, + "num_tokens": 6171112082.0, + "step": 12072 + }, + { + "epoch": 3.26473769605192, + "grad_norm": 2.453125, + "learning_rate": 0.007113013414864593, + "loss": 3.2335, + "mean_token_accuracy": 0.3915331959724426, + "num_tokens": 6171636287.0, + "step": 12073 + }, + { + "epoch": 3.2650081124932395, + "grad_norm": 3.4375, + "learning_rate": 0.007111591595484641, + "loss": 2.9974, + "mean_token_accuracy": 0.4223673641681671, + "num_tokens": 6172158368.0, + "step": 12074 + }, + { + "epoch": 3.265278528934559, + "grad_norm": 3.359375, + "learning_rate": 0.00711016989541249, + "loss": 2.9919, + "mean_token_accuracy": 0.4162105321884155, + "num_tokens": 6172682557.0, + "step": 12075 + }, + { + "epoch": 3.265548945375879, + "grad_norm": 3.34375, + "learning_rate": 0.007108748314691771, + "loss": 2.8898, + "mean_token_accuracy": 0.4195072650909424, + "num_tokens": 6173163079.0, + "step": 12076 + }, + { + "epoch": 3.2658193618171985, + "grad_norm": 15.9375, + "learning_rate": 0.007107326853366096, + "loss": 3.2348, + "mean_token_accuracy": 0.40777790546417236, + "num_tokens": 6173687148.0, + "step": 12077 + }, + { + "epoch": 3.266089778258518, + "grad_norm": 3.078125, + "learning_rate": 0.007105905511479076, + "loss": 3.166, + "mean_token_accuracy": 0.403400719165802, + "num_tokens": 6174211423.0, + "step": 12078 + }, + { + "epoch": 3.2663601946998377, + "grad_norm": 2.25, + "learning_rate": 0.007104484289074329, + "loss": 2.9449, + "mean_token_accuracy": 0.4228280782699585, + "num_tokens": 6174735694.0, + "step": 12079 + }, + { + "epoch": 3.2666306111411574, + "grad_norm": 2.625, + "learning_rate": 0.0071030631861954616, + "loss": 2.8347, + "mean_token_accuracy": 0.4288647770881653, + "num_tokens": 6175257546.0, + "step": 12080 + }, + { + "epoch": 3.266901027582477, + "grad_norm": 2.734375, + "learning_rate": 0.007101642202886074, + "loss": 2.8349, + "mean_token_accuracy": 0.4061757028102875, + "num_tokens": 6175781789.0, + "step": 12081 + }, + { + "epoch": 3.2671714440237967, + "grad_norm": 3.625, + "learning_rate": 0.007100221339189772, + "loss": 2.937, + "mean_token_accuracy": 0.43961215019226074, + "num_tokens": 6176280249.0, + "step": 12082 + }, + { + "epoch": 3.2674418604651163, + "grad_norm": 3.28125, + "learning_rate": 0.007098800595150141, + "loss": 3.0243, + "mean_token_accuracy": 0.42183178663253784, + "num_tokens": 6176804504.0, + "step": 12083 + }, + { + "epoch": 3.267712276906436, + "grad_norm": 3.4375, + "learning_rate": 0.007097379970810787, + "loss": 2.8905, + "mean_token_accuracy": 0.4050107002258301, + "num_tokens": 6177328749.0, + "step": 12084 + }, + { + "epoch": 3.2679826933477556, + "grad_norm": 2.515625, + "learning_rate": 0.007095959466215289, + "loss": 3.0933, + "mean_token_accuracy": 0.4191603660583496, + "num_tokens": 6177852925.0, + "step": 12085 + }, + { + "epoch": 3.2682531097890752, + "grad_norm": 3.84375, + "learning_rate": 0.007094539081407236, + "loss": 3.0372, + "mean_token_accuracy": 0.4100331962108612, + "num_tokens": 6178377197.0, + "step": 12086 + }, + { + "epoch": 3.268523526230395, + "grad_norm": 2.53125, + "learning_rate": 0.007093118816430211, + "loss": 2.9237, + "mean_token_accuracy": 0.407274067401886, + "num_tokens": 6178901416.0, + "step": 12087 + }, + { + "epoch": 3.2687939426717145, + "grad_norm": 3.3125, + "learning_rate": 0.007091698671327789, + "loss": 2.9321, + "mean_token_accuracy": 0.42322421073913574, + "num_tokens": 6179425586.0, + "step": 12088 + }, + { + "epoch": 3.269064359113034, + "grad_norm": 3.34375, + "learning_rate": 0.007090278646143544, + "loss": 3.0669, + "mean_token_accuracy": 0.40758728981018066, + "num_tokens": 6179949749.0, + "step": 12089 + }, + { + "epoch": 3.269334775554354, + "grad_norm": 3.4375, + "learning_rate": 0.007088858740921049, + "loss": 3.0377, + "mean_token_accuracy": 0.4130353331565857, + "num_tokens": 6180456705.0, + "step": 12090 + }, + { + "epoch": 3.2696051919956735, + "grad_norm": 4.4375, + "learning_rate": 0.007087438955703873, + "loss": 10.5308, + "mean_token_accuracy": 0.004743373952805996, + "num_tokens": 6180977814.0, + "step": 12091 + }, + { + "epoch": 3.269875608436993, + "grad_norm": 5.125, + "learning_rate": 0.00708601929053557, + "loss": 2.9403, + "mean_token_accuracy": 0.4351450502872467, + "num_tokens": 6181502032.0, + "step": 12092 + }, + { + "epoch": 3.2701460248783127, + "grad_norm": 2.09375, + "learning_rate": 0.0070845997454597074, + "loss": 3.159, + "mean_token_accuracy": 0.40274760127067566, + "num_tokens": 6182026311.0, + "step": 12093 + }, + { + "epoch": 3.2704164413196324, + "grad_norm": 2.390625, + "learning_rate": 0.0070831803205198395, + "loss": 3.056, + "mean_token_accuracy": 0.39922696352005005, + "num_tokens": 6182550588.0, + "step": 12094 + }, + { + "epoch": 3.270686857760952, + "grad_norm": 3.25, + "learning_rate": 0.007081761015759516, + "loss": 3.1945, + "mean_token_accuracy": 0.3880303204059601, + "num_tokens": 6183074729.0, + "step": 12095 + }, + { + "epoch": 3.2709572742022717, + "grad_norm": 2.8125, + "learning_rate": 0.007080341831222292, + "loss": 3.091, + "mean_token_accuracy": 0.434467077255249, + "num_tokens": 6183536395.0, + "step": 12096 + }, + { + "epoch": 3.2712276906435913, + "grad_norm": 3.5, + "learning_rate": 0.007078922766951705, + "loss": 3.0688, + "mean_token_accuracy": 0.4047596752643585, + "num_tokens": 6184060577.0, + "step": 12097 + }, + { + "epoch": 3.271498107084911, + "grad_norm": 3.53125, + "learning_rate": 0.0070775038229912955, + "loss": 2.9313, + "mean_token_accuracy": 0.42317867279052734, + "num_tokens": 6184584753.0, + "step": 12098 + }, + { + "epoch": 3.27176852352623, + "grad_norm": 3.625, + "learning_rate": 0.007076084999384609, + "loss": 3.054, + "mean_token_accuracy": 0.41670241951942444, + "num_tokens": 6185088035.0, + "step": 12099 + }, + { + "epoch": 3.2720389399675502, + "grad_norm": 2.921875, + "learning_rate": 0.007074666296175176, + "loss": 2.7202, + "mean_token_accuracy": 0.46092841029167175, + "num_tokens": 6185612200.0, + "step": 12100 + }, + { + "epoch": 3.2723093564088694, + "grad_norm": 2.265625, + "learning_rate": 0.007073247713406519, + "loss": 3.0531, + "mean_token_accuracy": 0.43343308568000793, + "num_tokens": 6186073220.0, + "step": 12101 + }, + { + "epoch": 3.2725797728501895, + "grad_norm": 3.046875, + "learning_rate": 0.0070718292511221725, + "loss": 3.0147, + "mean_token_accuracy": 0.420114666223526, + "num_tokens": 6186561795.0, + "step": 12102 + }, + { + "epoch": 3.2728501892915087, + "grad_norm": 2.828125, + "learning_rate": 0.007070410909365652, + "loss": 2.884, + "mean_token_accuracy": 0.4196300208568573, + "num_tokens": 6187085826.0, + "step": 12103 + }, + { + "epoch": 3.2731206057328284, + "grad_norm": 3.3125, + "learning_rate": 0.007068992688180489, + "loss": 3.0648, + "mean_token_accuracy": 0.4175497889518738, + "num_tokens": 6187610100.0, + "step": 12104 + }, + { + "epoch": 3.273391022174148, + "grad_norm": 3.03125, + "learning_rate": 0.007067574587610188, + "loss": 2.9185, + "mean_token_accuracy": 0.41384583711624146, + "num_tokens": 6188134293.0, + "step": 12105 + }, + { + "epoch": 3.2736614386154677, + "grad_norm": 3.296875, + "learning_rate": 0.007066156607698259, + "loss": 2.9471, + "mean_token_accuracy": 0.4195076525211334, + "num_tokens": 6188658551.0, + "step": 12106 + }, + { + "epoch": 3.2739318550567873, + "grad_norm": 3.328125, + "learning_rate": 0.007064738748488219, + "loss": 3.1723, + "mean_token_accuracy": 0.39747193455696106, + "num_tokens": 6189182727.0, + "step": 12107 + }, + { + "epoch": 3.274202271498107, + "grad_norm": 3.359375, + "learning_rate": 0.007063321010023563, + "loss": 3.0244, + "mean_token_accuracy": 0.42602038383483887, + "num_tokens": 6189661457.0, + "step": 12108 + }, + { + "epoch": 3.2744726879394266, + "grad_norm": 3.109375, + "learning_rate": 0.007061903392347797, + "loss": 3.052, + "mean_token_accuracy": 0.41956087946891785, + "num_tokens": 6190185734.0, + "step": 12109 + }, + { + "epoch": 3.2747431043807462, + "grad_norm": 3.75, + "learning_rate": 0.007060485895504416, + "loss": 2.8147, + "mean_token_accuracy": 0.4161766767501831, + "num_tokens": 6190709885.0, + "step": 12110 + }, + { + "epoch": 3.275013520822066, + "grad_norm": 8.3125, + "learning_rate": 0.007059068519536915, + "loss": 10.1647, + "mean_token_accuracy": 0.02613082528114319, + "num_tokens": 6191233961.0, + "step": 12111 + }, + { + "epoch": 3.2752839372633855, + "grad_norm": 7.21875, + "learning_rate": 0.007057651264488778, + "loss": 3.2652, + "mean_token_accuracy": 0.39525607228279114, + "num_tokens": 6191698035.0, + "step": 12112 + }, + { + "epoch": 3.275554353704705, + "grad_norm": 2.1875, + "learning_rate": 0.007056234130403496, + "loss": 3.0026, + "mean_token_accuracy": 0.41663670539855957, + "num_tokens": 6192184061.0, + "step": 12113 + }, + { + "epoch": 3.275824770146025, + "grad_norm": 3.09375, + "learning_rate": 0.00705481711732455, + "loss": 3.0359, + "mean_token_accuracy": 0.42024675011634827, + "num_tokens": 6192671219.0, + "step": 12114 + }, + { + "epoch": 3.2760951865873444, + "grad_norm": 68.0, + "learning_rate": 0.007053400225295415, + "loss": 3.6905, + "mean_token_accuracy": 0.3967573046684265, + "num_tokens": 6193166322.0, + "step": 12115 + }, + { + "epoch": 3.276365603028664, + "grad_norm": 5.125, + "learning_rate": 0.007051983454359568, + "loss": 3.0801, + "mean_token_accuracy": 0.39596301317214966, + "num_tokens": 6193690598.0, + "step": 12116 + }, + { + "epoch": 3.2766360194699837, + "grad_norm": 2.25, + "learning_rate": 0.007050566804560481, + "loss": 2.9344, + "mean_token_accuracy": 0.4130922257900238, + "num_tokens": 6194214826.0, + "step": 12117 + }, + { + "epoch": 3.2769064359113034, + "grad_norm": 2.640625, + "learning_rate": 0.007049150275941614, + "loss": 2.9098, + "mean_token_accuracy": 0.4223825931549072, + "num_tokens": 6194739000.0, + "step": 12118 + }, + { + "epoch": 3.277176852352623, + "grad_norm": 3.0625, + "learning_rate": 0.00704773386854644, + "loss": 2.981, + "mean_token_accuracy": 0.40957072377204895, + "num_tokens": 6195263245.0, + "step": 12119 + }, + { + "epoch": 3.2774472687939427, + "grad_norm": 2.515625, + "learning_rate": 0.007046317582418411, + "loss": 2.956, + "mean_token_accuracy": 0.42982780933380127, + "num_tokens": 6195787398.0, + "step": 12120 + }, + { + "epoch": 3.2777176852352623, + "grad_norm": 3.046875, + "learning_rate": 0.007044901417600986, + "loss": 3.1441, + "mean_token_accuracy": 0.3920145034790039, + "num_tokens": 6196311515.0, + "step": 12121 + }, + { + "epoch": 3.277988101676582, + "grad_norm": 2.609375, + "learning_rate": 0.0070434853741376145, + "loss": 2.9056, + "mean_token_accuracy": 0.40048104524612427, + "num_tokens": 6196835783.0, + "step": 12122 + }, + { + "epoch": 3.2782585181179016, + "grad_norm": 3.046875, + "learning_rate": 0.007042069452071751, + "loss": 3.0463, + "mean_token_accuracy": 0.40337491035461426, + "num_tokens": 6197360035.0, + "step": 12123 + }, + { + "epoch": 3.2785289345592212, + "grad_norm": 3.21875, + "learning_rate": 0.007040653651446831, + "loss": 2.9784, + "mean_token_accuracy": 0.41986531019210815, + "num_tokens": 6197884239.0, + "step": 12124 + }, + { + "epoch": 3.278799351000541, + "grad_norm": 2.9375, + "learning_rate": 0.007039237972306307, + "loss": 2.8782, + "mean_token_accuracy": 0.42837685346603394, + "num_tokens": 6198408435.0, + "step": 12125 + }, + { + "epoch": 3.2790697674418605, + "grad_norm": 3.4375, + "learning_rate": 0.007037822414693607, + "loss": 3.1578, + "mean_token_accuracy": 0.4000740051269531, + "num_tokens": 6198932592.0, + "step": 12126 + }, + { + "epoch": 3.27934018388318, + "grad_norm": 3.171875, + "learning_rate": 0.007036406978652168, + "loss": 2.9989, + "mean_token_accuracy": 0.40929582715034485, + "num_tokens": 6199456770.0, + "step": 12127 + }, + { + "epoch": 3.2796106003245, + "grad_norm": 3.734375, + "learning_rate": 0.007034991664225419, + "loss": 2.9667, + "mean_token_accuracy": 0.4315922260284424, + "num_tokens": 6199975150.0, + "step": 12128 + }, + { + "epoch": 3.2798810167658194, + "grad_norm": 3.15625, + "learning_rate": 0.007033576471456784, + "loss": 3.076, + "mean_token_accuracy": 0.40974995493888855, + "num_tokens": 6200470454.0, + "step": 12129 + }, + { + "epoch": 3.280151433207139, + "grad_norm": 2.921875, + "learning_rate": 0.00703216140038969, + "loss": 3.0619, + "mean_token_accuracy": 0.41440489888191223, + "num_tokens": 6200994627.0, + "step": 12130 + }, + { + "epoch": 3.2804218496484587, + "grad_norm": 8.75, + "learning_rate": 0.007030746451067553, + "loss": 11.3706, + "mean_token_accuracy": 0.009216832928359509, + "num_tokens": 6201496018.0, + "step": 12131 + }, + { + "epoch": 3.2806922660897784, + "grad_norm": 6.0625, + "learning_rate": 0.007029331623533783, + "loss": 3.1415, + "mean_token_accuracy": 0.3995131254196167, + "num_tokens": 6202016530.0, + "step": 12132 + }, + { + "epoch": 3.280962682531098, + "grad_norm": 2.234375, + "learning_rate": 0.007027916917831802, + "loss": 3.001, + "mean_token_accuracy": 0.41376665234565735, + "num_tokens": 6202530296.0, + "step": 12133 + }, + { + "epoch": 3.2812330989724177, + "grad_norm": 2.328125, + "learning_rate": 0.007026502334005012, + "loss": 2.9048, + "mean_token_accuracy": 0.41446566581726074, + "num_tokens": 6203054323.0, + "step": 12134 + }, + { + "epoch": 3.2815035154137373, + "grad_norm": 3.15625, + "learning_rate": 0.007025087872096809, + "loss": 3.0392, + "mean_token_accuracy": 0.4057932198047638, + "num_tokens": 6203555043.0, + "step": 12135 + }, + { + "epoch": 3.281773931855057, + "grad_norm": 3.265625, + "learning_rate": 0.0070236735321506025, + "loss": 3.2296, + "mean_token_accuracy": 0.38445693254470825, + "num_tokens": 6204079308.0, + "step": 12136 + }, + { + "epoch": 3.2820443482963766, + "grad_norm": 3.8125, + "learning_rate": 0.007022259314209784, + "loss": 3.0964, + "mean_token_accuracy": 0.40057915449142456, + "num_tokens": 6204603422.0, + "step": 12137 + }, + { + "epoch": 3.2823147647376962, + "grad_norm": 2.59375, + "learning_rate": 0.007020845218317749, + "loss": 2.9395, + "mean_token_accuracy": 0.4233008325099945, + "num_tokens": 6205116997.0, + "step": 12138 + }, + { + "epoch": 3.282585181179016, + "grad_norm": 3.453125, + "learning_rate": 0.007019431244517886, + "loss": 2.9076, + "mean_token_accuracy": 0.4336100220680237, + "num_tokens": 6205641142.0, + "step": 12139 + }, + { + "epoch": 3.282855597620335, + "grad_norm": 3.515625, + "learning_rate": 0.007018017392853579, + "loss": 2.8797, + "mean_token_accuracy": 0.4136478006839752, + "num_tokens": 6206165289.0, + "step": 12140 + }, + { + "epoch": 3.283126014061655, + "grad_norm": 3.078125, + "learning_rate": 0.007016603663368204, + "loss": 2.8452, + "mean_token_accuracy": 0.4322527050971985, + "num_tokens": 6206689492.0, + "step": 12141 + }, + { + "epoch": 3.2833964305029744, + "grad_norm": 3.921875, + "learning_rate": 0.007015190056105149, + "loss": 2.9305, + "mean_token_accuracy": 0.41835713386535645, + "num_tokens": 6207213647.0, + "step": 12142 + }, + { + "epoch": 3.2836668469442944, + "grad_norm": 3.40625, + "learning_rate": 0.007013776571107779, + "loss": 3.1119, + "mean_token_accuracy": 0.42333757877349854, + "num_tokens": 6207737872.0, + "step": 12143 + }, + { + "epoch": 3.2839372633856136, + "grad_norm": 3.421875, + "learning_rate": 0.0070123632084194655, + "loss": 2.9441, + "mean_token_accuracy": 0.40996965765953064, + "num_tokens": 6208261794.0, + "step": 12144 + }, + { + "epoch": 3.2842076798269333, + "grad_norm": 3.125, + "learning_rate": 0.0070109499680835775, + "loss": 2.9934, + "mean_token_accuracy": 0.41884303092956543, + "num_tokens": 6208786068.0, + "step": 12145 + }, + { + "epoch": 3.284478096268253, + "grad_norm": 3.03125, + "learning_rate": 0.007009536850143477, + "loss": 2.9145, + "mean_token_accuracy": 0.42809993028640747, + "num_tokens": 6209275628.0, + "step": 12146 + }, + { + "epoch": 3.2847485127095726, + "grad_norm": 2.40625, + "learning_rate": 0.0070081238546425145, + "loss": 2.9961, + "mean_token_accuracy": 0.44103115797042847, + "num_tokens": 6209799867.0, + "step": 12147 + }, + { + "epoch": 3.285018929150892, + "grad_norm": 3.171875, + "learning_rate": 0.007006710981624058, + "loss": 2.8141, + "mean_token_accuracy": 0.432720422744751, + "num_tokens": 6210324090.0, + "step": 12148 + }, + { + "epoch": 3.285289345592212, + "grad_norm": 3.0, + "learning_rate": 0.007005298231131447, + "loss": 2.8064, + "mean_token_accuracy": 0.42467477917671204, + "num_tokens": 6210830585.0, + "step": 12149 + }, + { + "epoch": 3.2855597620335315, + "grad_norm": 3.484375, + "learning_rate": 0.007003885603208038, + "loss": 2.9268, + "mean_token_accuracy": 0.4245637357234955, + "num_tokens": 6211354834.0, + "step": 12150 + }, + { + "epoch": 3.285830178474851, + "grad_norm": 18.0, + "learning_rate": 0.007002473097897168, + "loss": 11.6145, + "mean_token_accuracy": 0.0, + "num_tokens": 6211807364.0, + "step": 12151 + }, + { + "epoch": 3.286100594916171, + "grad_norm": 4.78125, + "learning_rate": 0.007001060715242178, + "loss": 3.2181, + "mean_token_accuracy": 0.36607933044433594, + "num_tokens": 6212331570.0, + "step": 12152 + }, + { + "epoch": 3.2863710113574904, + "grad_norm": 1.7109375, + "learning_rate": 0.006999648455286409, + "loss": 2.9539, + "mean_token_accuracy": 0.41882845759391785, + "num_tokens": 6212855601.0, + "step": 12153 + }, + { + "epoch": 3.28664142779881, + "grad_norm": 3.140625, + "learning_rate": 0.00699823631807319, + "loss": 2.8721, + "mean_token_accuracy": 0.44437041878700256, + "num_tokens": 6213379847.0, + "step": 12154 + }, + { + "epoch": 3.2869118442401297, + "grad_norm": 2.421875, + "learning_rate": 0.006996824303645846, + "loss": 3.1235, + "mean_token_accuracy": 0.41842180490493774, + "num_tokens": 6213866367.0, + "step": 12155 + }, + { + "epoch": 3.2871822606814494, + "grad_norm": 2.9375, + "learning_rate": 0.006995412412047707, + "loss": 3.0057, + "mean_token_accuracy": 0.4189992845058441, + "num_tokens": 6214390590.0, + "step": 12156 + }, + { + "epoch": 3.287452677122769, + "grad_norm": 3.140625, + "learning_rate": 0.006994000643322093, + "loss": 3.0943, + "mean_token_accuracy": 0.40623778104782104, + "num_tokens": 6214914663.0, + "step": 12157 + }, + { + "epoch": 3.2877230935640886, + "grad_norm": 3.4375, + "learning_rate": 0.006992588997512317, + "loss": 2.9673, + "mean_token_accuracy": 0.4083501398563385, + "num_tokens": 6215438859.0, + "step": 12158 + }, + { + "epoch": 3.2879935100054083, + "grad_norm": 2.796875, + "learning_rate": 0.006991177474661698, + "loss": 2.9866, + "mean_token_accuracy": 0.42718854546546936, + "num_tokens": 6215963120.0, + "step": 12159 + }, + { + "epoch": 3.288263926446728, + "grad_norm": 3.078125, + "learning_rate": 0.006989766074813544, + "loss": 3.1143, + "mean_token_accuracy": 0.41662001609802246, + "num_tokens": 6216487310.0, + "step": 12160 + }, + { + "epoch": 3.2885343428880476, + "grad_norm": 2.96875, + "learning_rate": 0.006988354798011156, + "loss": 2.8985, + "mean_token_accuracy": 0.434156596660614, + "num_tokens": 6217011490.0, + "step": 12161 + }, + { + "epoch": 3.2888047593293672, + "grad_norm": 2.890625, + "learning_rate": 0.006986943644297845, + "loss": 2.9965, + "mean_token_accuracy": 0.4090389907360077, + "num_tokens": 6217535745.0, + "step": 12162 + }, + { + "epoch": 3.289075175770687, + "grad_norm": 2.734375, + "learning_rate": 0.006985532613716903, + "loss": 2.8658, + "mean_token_accuracy": 0.4268983006477356, + "num_tokens": 6218050009.0, + "step": 12163 + }, + { + "epoch": 3.2893455922120065, + "grad_norm": 2.5625, + "learning_rate": 0.006984121706311625, + "loss": 2.8856, + "mean_token_accuracy": 0.4377157688140869, + "num_tokens": 6218566597.0, + "step": 12164 + }, + { + "epoch": 3.289616008653326, + "grad_norm": 3.203125, + "learning_rate": 0.006982710922125302, + "loss": 3.066, + "mean_token_accuracy": 0.4202680289745331, + "num_tokens": 6219090859.0, + "step": 12165 + }, + { + "epoch": 3.289886425094646, + "grad_norm": 3.171875, + "learning_rate": 0.006981300261201224, + "loss": 2.9472, + "mean_token_accuracy": 0.4339773654937744, + "num_tokens": 6219585956.0, + "step": 12166 + }, + { + "epoch": 3.2901568415359654, + "grad_norm": 2.96875, + "learning_rate": 0.00697988972358267, + "loss": 2.9989, + "mean_token_accuracy": 0.4336380064487457, + "num_tokens": 6220110205.0, + "step": 12167 + }, + { + "epoch": 3.290427257977285, + "grad_norm": 2.953125, + "learning_rate": 0.006978479309312924, + "loss": 2.8071, + "mean_token_accuracy": 0.4211946725845337, + "num_tokens": 6220634403.0, + "step": 12168 + }, + { + "epoch": 3.2906976744186047, + "grad_norm": 3.140625, + "learning_rate": 0.00697706901843526, + "loss": 2.9095, + "mean_token_accuracy": 0.40864482522010803, + "num_tokens": 6221158654.0, + "step": 12169 + }, + { + "epoch": 3.2909680908599244, + "grad_norm": 23.875, + "learning_rate": 0.006975658850992947, + "loss": 3.0097, + "mean_token_accuracy": 0.4447512924671173, + "num_tokens": 6221682921.0, + "step": 12170 + }, + { + "epoch": 3.291238507301244, + "grad_norm": 84.0, + "learning_rate": 0.006974248807029257, + "loss": 9.7215, + "mean_token_accuracy": 0.007788957562297583, + "num_tokens": 6222207062.0, + "step": 12171 + }, + { + "epoch": 3.2915089237425637, + "grad_norm": 7.5, + "learning_rate": 0.0069728388865874505, + "loss": 3.3938, + "mean_token_accuracy": 0.34901073575019836, + "num_tokens": 6222731318.0, + "step": 12172 + }, + { + "epoch": 3.2917793401838833, + "grad_norm": 2.359375, + "learning_rate": 0.006971429089710794, + "loss": 3.1654, + "mean_token_accuracy": 0.40508219599723816, + "num_tokens": 6223255597.0, + "step": 12173 + }, + { + "epoch": 3.292049756625203, + "grad_norm": 2.625, + "learning_rate": 0.006970019416442538, + "loss": 2.8029, + "mean_token_accuracy": 0.4342557191848755, + "num_tokens": 6223720337.0, + "step": 12174 + }, + { + "epoch": 3.2923201730665226, + "grad_norm": 2.671875, + "learning_rate": 0.0069686098668259346, + "loss": 2.9897, + "mean_token_accuracy": 0.4112986922264099, + "num_tokens": 6224244529.0, + "step": 12175 + }, + { + "epoch": 3.2925905895078422, + "grad_norm": 2.96875, + "learning_rate": 0.00696720044090424, + "loss": 3.0816, + "mean_token_accuracy": 0.41356000304222107, + "num_tokens": 6224768664.0, + "step": 12176 + }, + { + "epoch": 3.292861005949162, + "grad_norm": 3.296875, + "learning_rate": 0.006965791138720696, + "loss": 2.9841, + "mean_token_accuracy": 0.4218028783798218, + "num_tokens": 6225284142.0, + "step": 12177 + }, + { + "epoch": 3.2931314223904815, + "grad_norm": 3.375, + "learning_rate": 0.0069643819603185395, + "loss": 3.0903, + "mean_token_accuracy": 0.4092162847518921, + "num_tokens": 6225808406.0, + "step": 12178 + }, + { + "epoch": 3.293401838831801, + "grad_norm": 3.78125, + "learning_rate": 0.006962972905741017, + "loss": 3.1239, + "mean_token_accuracy": 0.4107097387313843, + "num_tokens": 6226332642.0, + "step": 12179 + }, + { + "epoch": 3.293672255273121, + "grad_norm": 3.25, + "learning_rate": 0.006961563975031356, + "loss": 3.0751, + "mean_token_accuracy": 0.3830993175506592, + "num_tokens": 6226856924.0, + "step": 12180 + }, + { + "epoch": 3.29394267171444, + "grad_norm": 3.5, + "learning_rate": 0.0069601551682327845, + "loss": 3.0561, + "mean_token_accuracy": 0.4268529713153839, + "num_tokens": 6227381199.0, + "step": 12181 + }, + { + "epoch": 3.29421308815576, + "grad_norm": 2.59375, + "learning_rate": 0.006958746485388538, + "loss": 2.9123, + "mean_token_accuracy": 0.42499324679374695, + "num_tokens": 6227905430.0, + "step": 12182 + }, + { + "epoch": 3.2944835045970793, + "grad_norm": 2.734375, + "learning_rate": 0.006957337926541834, + "loss": 2.8033, + "mean_token_accuracy": 0.44151556491851807, + "num_tokens": 6228429629.0, + "step": 12183 + }, + { + "epoch": 3.2947539210383994, + "grad_norm": 2.90625, + "learning_rate": 0.006955929491735889, + "loss": 2.966, + "mean_token_accuracy": 0.4277133047580719, + "num_tokens": 6228902027.0, + "step": 12184 + }, + { + "epoch": 3.2950243374797186, + "grad_norm": 3.203125, + "learning_rate": 0.006954521181013921, + "loss": 2.81, + "mean_token_accuracy": 0.4316186308860779, + "num_tokens": 6229426230.0, + "step": 12185 + }, + { + "epoch": 3.295294753921038, + "grad_norm": 2.40625, + "learning_rate": 0.006953112994419142, + "loss": 2.8199, + "mean_token_accuracy": 0.47092366218566895, + "num_tokens": 6229886060.0, + "step": 12186 + }, + { + "epoch": 3.295565170362358, + "grad_norm": 3.21875, + "learning_rate": 0.006951704931994753, + "loss": 3.0951, + "mean_token_accuracy": 0.3992003798484802, + "num_tokens": 6230410304.0, + "step": 12187 + }, + { + "epoch": 3.2958355868036775, + "grad_norm": 2.828125, + "learning_rate": 0.006950296993783966, + "loss": 2.8955, + "mean_token_accuracy": 0.40473994612693787, + "num_tokens": 6230934545.0, + "step": 12188 + }, + { + "epoch": 3.296106003244997, + "grad_norm": 2.75, + "learning_rate": 0.006948889179829976, + "loss": 2.9605, + "mean_token_accuracy": 0.4107118248939514, + "num_tokens": 6231458817.0, + "step": 12189 + }, + { + "epoch": 3.296376419686317, + "grad_norm": 2.78125, + "learning_rate": 0.006947481490175977, + "loss": 2.9162, + "mean_token_accuracy": 0.42511922121047974, + "num_tokens": 6231983043.0, + "step": 12190 + }, + { + "epoch": 3.2966468361276364, + "grad_norm": 98.0, + "learning_rate": 0.006946073924865166, + "loss": 11.3066, + "mean_token_accuracy": 0.002000915352255106, + "num_tokens": 6232507311.0, + "step": 12191 + }, + { + "epoch": 3.296917252568956, + "grad_norm": 6.90625, + "learning_rate": 0.0069446664839407255, + "loss": 3.065, + "mean_token_accuracy": 0.38560640811920166, + "num_tokens": 6233031566.0, + "step": 12192 + }, + { + "epoch": 3.2971876690102757, + "grad_norm": 1.765625, + "learning_rate": 0.0069432591674458465, + "loss": 2.9671, + "mean_token_accuracy": 0.4133170247077942, + "num_tokens": 6233555810.0, + "step": 12193 + }, + { + "epoch": 3.2974580854515954, + "grad_norm": 2.296875, + "learning_rate": 0.006941851975423704, + "loss": 2.8882, + "mean_token_accuracy": 0.42304670810699463, + "num_tokens": 6234080010.0, + "step": 12194 + }, + { + "epoch": 3.297728501892915, + "grad_norm": 2.28125, + "learning_rate": 0.006940444907917476, + "loss": 2.959, + "mean_token_accuracy": 0.4386351704597473, + "num_tokens": 6234604281.0, + "step": 12195 + }, + { + "epoch": 3.2979989183342346, + "grad_norm": 2.875, + "learning_rate": 0.006939037964970341, + "loss": 2.9639, + "mean_token_accuracy": 0.4271448850631714, + "num_tokens": 6235128448.0, + "step": 12196 + }, + { + "epoch": 3.2982693347755543, + "grad_norm": 3.234375, + "learning_rate": 0.006937631146625462, + "loss": 3.0462, + "mean_token_accuracy": 0.41097450256347656, + "num_tokens": 6235652553.0, + "step": 12197 + }, + { + "epoch": 3.298539751216874, + "grad_norm": 4.3125, + "learning_rate": 0.006936224452926005, + "loss": 2.923, + "mean_token_accuracy": 0.4368421733379364, + "num_tokens": 6236169238.0, + "step": 12198 + }, + { + "epoch": 3.2988101676581936, + "grad_norm": 3.171875, + "learning_rate": 0.006934817883915137, + "loss": 2.9645, + "mean_token_accuracy": 0.40935632586479187, + "num_tokens": 6236669994.0, + "step": 12199 + }, + { + "epoch": 3.299080584099513, + "grad_norm": 2.84375, + "learning_rate": 0.006933411439636009, + "loss": 2.7775, + "mean_token_accuracy": 0.4459790885448456, + "num_tokens": 6237182746.0, + "step": 12200 + }, + { + "epoch": 3.299351000540833, + "grad_norm": 2.828125, + "learning_rate": 0.006932005120131772, + "loss": 3.0205, + "mean_token_accuracy": 0.38751596212387085, + "num_tokens": 6237707014.0, + "step": 12201 + }, + { + "epoch": 3.2996214169821525, + "grad_norm": 3.171875, + "learning_rate": 0.006930598925445586, + "loss": 2.9165, + "mean_token_accuracy": 0.4257115125656128, + "num_tokens": 6238231288.0, + "step": 12202 + }, + { + "epoch": 3.299891833423472, + "grad_norm": 3.0625, + "learning_rate": 0.006929192855620594, + "loss": 3.021, + "mean_token_accuracy": 0.42256057262420654, + "num_tokens": 6238755538.0, + "step": 12203 + }, + { + "epoch": 3.300162249864792, + "grad_norm": 3.59375, + "learning_rate": 0.006927786910699931, + "loss": 2.9462, + "mean_token_accuracy": 0.4163067042827606, + "num_tokens": 6239279723.0, + "step": 12204 + }, + { + "epoch": 3.3004326663061114, + "grad_norm": 2.828125, + "learning_rate": 0.006926381090726745, + "loss": 2.8378, + "mean_token_accuracy": 0.43778273463249207, + "num_tokens": 6239803895.0, + "step": 12205 + }, + { + "epoch": 3.300703082747431, + "grad_norm": 3.28125, + "learning_rate": 0.006924975395744168, + "loss": 3.1281, + "mean_token_accuracy": 0.4087563157081604, + "num_tokens": 6240328047.0, + "step": 12206 + }, + { + "epoch": 3.3009734991887507, + "grad_norm": 3.15625, + "learning_rate": 0.006923569825795325, + "loss": 2.9116, + "mean_token_accuracy": 0.43002861738204956, + "num_tokens": 6240838159.0, + "step": 12207 + }, + { + "epoch": 3.3012439156300704, + "grad_norm": 3.25, + "learning_rate": 0.006922164380923348, + "loss": 3.0641, + "mean_token_accuracy": 0.40095221996307373, + "num_tokens": 6241362425.0, + "step": 12208 + }, + { + "epoch": 3.30151433207139, + "grad_norm": 3.421875, + "learning_rate": 0.006920759061171364, + "loss": 3.0136, + "mean_token_accuracy": 0.43990960717201233, + "num_tokens": 6241825522.0, + "step": 12209 + }, + { + "epoch": 3.3017847485127096, + "grad_norm": 2.9375, + "learning_rate": 0.006919353866582485, + "loss": 3.0004, + "mean_token_accuracy": 0.4028366804122925, + "num_tokens": 6242349804.0, + "step": 12210 + }, + { + "epoch": 3.3020551649540293, + "grad_norm": 1.640625, + "learning_rate": 0.006917948797199833, + "loss": 11.12, + "mean_token_accuracy": 0.0, + "num_tokens": 6242874078.0, + "step": 12211 + }, + { + "epoch": 3.302325581395349, + "grad_norm": 7.71875, + "learning_rate": 0.006916543853066517, + "loss": 3.1502, + "mean_token_accuracy": 0.4020403027534485, + "num_tokens": 6243370257.0, + "step": 12212 + }, + { + "epoch": 3.3025959978366686, + "grad_norm": 2.140625, + "learning_rate": 0.0069151390342256395, + "loss": 2.8418, + "mean_token_accuracy": 0.4838287830352783, + "num_tokens": 6243894500.0, + "step": 12213 + }, + { + "epoch": 3.302866414277988, + "grad_norm": 2.65625, + "learning_rate": 0.006913734340720313, + "loss": 3.0995, + "mean_token_accuracy": 0.3983888030052185, + "num_tokens": 6244418653.0, + "step": 12214 + }, + { + "epoch": 3.303136830719308, + "grad_norm": 2.953125, + "learning_rate": 0.006912329772593634, + "loss": 3.059, + "mean_token_accuracy": 0.4024788439273834, + "num_tokens": 6244938207.0, + "step": 12215 + }, + { + "epoch": 3.3034072471606275, + "grad_norm": 3.65625, + "learning_rate": 0.006910925329888695, + "loss": 2.8634, + "mean_token_accuracy": 0.41664987802505493, + "num_tokens": 6245462341.0, + "step": 12216 + }, + { + "epoch": 3.303677663601947, + "grad_norm": 2.984375, + "learning_rate": 0.006909521012648597, + "loss": 2.8497, + "mean_token_accuracy": 0.430944561958313, + "num_tokens": 6245986332.0, + "step": 12217 + }, + { + "epoch": 3.303948080043267, + "grad_norm": 3.671875, + "learning_rate": 0.006908116820916418, + "loss": 3.0088, + "mean_token_accuracy": 0.4055400490760803, + "num_tokens": 6246510487.0, + "step": 12218 + }, + { + "epoch": 3.3042184964845864, + "grad_norm": 3.015625, + "learning_rate": 0.006906712754735252, + "loss": 2.979, + "mean_token_accuracy": 0.41745713353157043, + "num_tokens": 6247034761.0, + "step": 12219 + }, + { + "epoch": 3.304488912925906, + "grad_norm": 3.296875, + "learning_rate": 0.006905308814148178, + "loss": 2.9862, + "mean_token_accuracy": 0.408743679523468, + "num_tokens": 6247558960.0, + "step": 12220 + }, + { + "epoch": 3.3047593293672257, + "grad_norm": 3.546875, + "learning_rate": 0.006903904999198266, + "loss": 3.0879, + "mean_token_accuracy": 0.42368555068969727, + "num_tokens": 6248019801.0, + "step": 12221 + }, + { + "epoch": 3.305029745808545, + "grad_norm": 3.53125, + "learning_rate": 0.006902501309928596, + "loss": 2.8777, + "mean_token_accuracy": 0.41925352811813354, + "num_tokens": 6248524286.0, + "step": 12222 + }, + { + "epoch": 3.305300162249865, + "grad_norm": 3.484375, + "learning_rate": 0.006901097746382237, + "loss": 3.0571, + "mean_token_accuracy": 0.41437768936157227, + "num_tokens": 6248998644.0, + "step": 12223 + }, + { + "epoch": 3.305570578691184, + "grad_norm": 3.15625, + "learning_rate": 0.006899694308602254, + "loss": 3.036, + "mean_token_accuracy": 0.4169818162918091, + "num_tokens": 6249522884.0, + "step": 12224 + }, + { + "epoch": 3.3058409951325043, + "grad_norm": 2.96875, + "learning_rate": 0.0068982909966317065, + "loss": 3.1512, + "mean_token_accuracy": 0.39642971754074097, + "num_tokens": 6250008120.0, + "step": 12225 + }, + { + "epoch": 3.3061114115738235, + "grad_norm": 2.75, + "learning_rate": 0.006896887810513657, + "loss": 2.9486, + "mean_token_accuracy": 0.40643107891082764, + "num_tokens": 6250532331.0, + "step": 12226 + }, + { + "epoch": 3.306381828015143, + "grad_norm": 2.671875, + "learning_rate": 0.00689548475029115, + "loss": 2.9433, + "mean_token_accuracy": 0.4154224097728729, + "num_tokens": 6251056589.0, + "step": 12227 + }, + { + "epoch": 3.3066522444564628, + "grad_norm": 3.515625, + "learning_rate": 0.006894081816007247, + "loss": 3.0471, + "mean_token_accuracy": 0.4192550778388977, + "num_tokens": 6251580540.0, + "step": 12228 + }, + { + "epoch": 3.3069226608977824, + "grad_norm": 3.75, + "learning_rate": 0.006892679007704987, + "loss": 2.9659, + "mean_token_accuracy": 0.4113973081111908, + "num_tokens": 6252104680.0, + "step": 12229 + }, + { + "epoch": 3.307193077339102, + "grad_norm": 3.328125, + "learning_rate": 0.006891276325427412, + "loss": 2.8589, + "mean_token_accuracy": 0.42062056064605713, + "num_tokens": 6252628911.0, + "step": 12230 + }, + { + "epoch": 3.3074634937804217, + "grad_norm": 4.9375, + "learning_rate": 0.006889873769217565, + "loss": 12.5369, + "mean_token_accuracy": 0.00012084838817827404, + "num_tokens": 6253121290.0, + "step": 12231 + }, + { + "epoch": 3.3077339102217413, + "grad_norm": 7.78125, + "learning_rate": 0.006888471339118477, + "loss": 2.9514, + "mean_token_accuracy": 0.4026148319244385, + "num_tokens": 6253645515.0, + "step": 12232 + }, + { + "epoch": 3.308004326663061, + "grad_norm": 2.171875, + "learning_rate": 0.006887069035173177, + "loss": 3.0804, + "mean_token_accuracy": 0.4136617183685303, + "num_tokens": 6254169753.0, + "step": 12233 + }, + { + "epoch": 3.3082747431043806, + "grad_norm": 2.515625, + "learning_rate": 0.006885666857424698, + "loss": 2.9458, + "mean_token_accuracy": 0.4236527979373932, + "num_tokens": 6254694023.0, + "step": 12234 + }, + { + "epoch": 3.3085451595457003, + "grad_norm": 3.359375, + "learning_rate": 0.006884264805916055, + "loss": 2.7919, + "mean_token_accuracy": 0.4165130853652954, + "num_tokens": 6255218299.0, + "step": 12235 + }, + { + "epoch": 3.30881557598702, + "grad_norm": 2.625, + "learning_rate": 0.006882862880690275, + "loss": 3.0422, + "mean_token_accuracy": 0.41231653094291687, + "num_tokens": 6255742465.0, + "step": 12236 + }, + { + "epoch": 3.3090859924283396, + "grad_norm": 9.875, + "learning_rate": 0.006881461081790366, + "loss": 2.8858, + "mean_token_accuracy": 0.44962358474731445, + "num_tokens": 6256266641.0, + "step": 12237 + }, + { + "epoch": 3.309356408869659, + "grad_norm": 12.3125, + "learning_rate": 0.006880059409259347, + "loss": 2.7458, + "mean_token_accuracy": 0.4686344265937805, + "num_tokens": 6256790918.0, + "step": 12238 + }, + { + "epoch": 3.309626825310979, + "grad_norm": 2.109375, + "learning_rate": 0.006878657863140219, + "loss": 3.1189, + "mean_token_accuracy": 0.3976569175720215, + "num_tokens": 6257315165.0, + "step": 12239 + }, + { + "epoch": 3.3098972417522985, + "grad_norm": 3.03125, + "learning_rate": 0.0068772564434759886, + "loss": 3.1219, + "mean_token_accuracy": 0.3903887867927551, + "num_tokens": 6257839386.0, + "step": 12240 + }, + { + "epoch": 3.310167658193618, + "grad_norm": 3.171875, + "learning_rate": 0.006875855150309651, + "loss": 3.0184, + "mean_token_accuracy": 0.4020165801048279, + "num_tokens": 6258363645.0, + "step": 12241 + }, + { + "epoch": 3.3104380746349378, + "grad_norm": 2.84375, + "learning_rate": 0.006874453983684212, + "loss": 2.993, + "mean_token_accuracy": 0.41294023394584656, + "num_tokens": 6258887822.0, + "step": 12242 + }, + { + "epoch": 3.3107084910762574, + "grad_norm": 3.140625, + "learning_rate": 0.006873052943642655, + "loss": 3.0896, + "mean_token_accuracy": 0.4053592383861542, + "num_tokens": 6259368392.0, + "step": 12243 + }, + { + "epoch": 3.310978907517577, + "grad_norm": 2.578125, + "learning_rate": 0.006871652030227966, + "loss": 2.8985, + "mean_token_accuracy": 0.4216000437736511, + "num_tokens": 6259892646.0, + "step": 12244 + }, + { + "epoch": 3.3112493239588967, + "grad_norm": 2.84375, + "learning_rate": 0.006870251243483141, + "loss": 3.0204, + "mean_token_accuracy": 0.39186474680900574, + "num_tokens": 6260416904.0, + "step": 12245 + }, + { + "epoch": 3.3115197404002163, + "grad_norm": 3.328125, + "learning_rate": 0.0068688505834511495, + "loss": 3.0452, + "mean_token_accuracy": 0.41332828998565674, + "num_tokens": 6260941175.0, + "step": 12246 + }, + { + "epoch": 3.311790156841536, + "grad_norm": 3.46875, + "learning_rate": 0.006867450050174968, + "loss": 3.203, + "mean_token_accuracy": 0.42356592416763306, + "num_tokens": 6261401602.0, + "step": 12247 + }, + { + "epoch": 3.3120605732828556, + "grad_norm": 2.84375, + "learning_rate": 0.006866049643697575, + "loss": 2.8983, + "mean_token_accuracy": 0.4066651463508606, + "num_tokens": 6261922429.0, + "step": 12248 + }, + { + "epoch": 3.3123309897241753, + "grad_norm": 2.984375, + "learning_rate": 0.006864649364061936, + "loss": 2.8423, + "mean_token_accuracy": 0.3993613123893738, + "num_tokens": 6262446629.0, + "step": 12249 + }, + { + "epoch": 3.312601406165495, + "grad_norm": 2.5625, + "learning_rate": 0.006863249211311013, + "loss": 3.0218, + "mean_token_accuracy": 0.41818487644195557, + "num_tokens": 6262970873.0, + "step": 12250 + }, + { + "epoch": 3.3128718226068146, + "grad_norm": 10.0, + "learning_rate": 0.00686184918548777, + "loss": 10.2497, + "mean_token_accuracy": 0.017206883057951927, + "num_tokens": 6263433471.0, + "step": 12251 + }, + { + "epoch": 3.313142239048134, + "grad_norm": 9.0, + "learning_rate": 0.006860449286635166, + "loss": 3.2942, + "mean_token_accuracy": 0.37195050716400146, + "num_tokens": 6263957531.0, + "step": 12252 + }, + { + "epoch": 3.313412655489454, + "grad_norm": 2.390625, + "learning_rate": 0.006859049514796149, + "loss": 3.1444, + "mean_token_accuracy": 0.39959800243377686, + "num_tokens": 6264427293.0, + "step": 12253 + }, + { + "epoch": 3.3136830719307735, + "grad_norm": 2.875, + "learning_rate": 0.0068576498700136735, + "loss": 2.8254, + "mean_token_accuracy": 0.4165475368499756, + "num_tokens": 6264926444.0, + "step": 12254 + }, + { + "epoch": 3.313953488372093, + "grad_norm": 2.71875, + "learning_rate": 0.006856250352330683, + "loss": 2.9792, + "mean_token_accuracy": 0.4131348133087158, + "num_tokens": 6265424040.0, + "step": 12255 + }, + { + "epoch": 3.314223904813413, + "grad_norm": 3.578125, + "learning_rate": 0.006854850961790112, + "loss": 3.1046, + "mean_token_accuracy": 0.419177383184433, + "num_tokens": 6265948307.0, + "step": 12256 + }, + { + "epoch": 3.3144943212547324, + "grad_norm": 3.375, + "learning_rate": 0.006853451698434908, + "loss": 3.0219, + "mean_token_accuracy": 0.41998976469039917, + "num_tokens": 6266466425.0, + "step": 12257 + }, + { + "epoch": 3.314764737696052, + "grad_norm": 2.453125, + "learning_rate": 0.006852052562308002, + "loss": 2.8188, + "mean_token_accuracy": 0.41690921783447266, + "num_tokens": 6266986159.0, + "step": 12258 + }, + { + "epoch": 3.3150351541373717, + "grad_norm": 2.640625, + "learning_rate": 0.006850653553452315, + "loss": 3.2138, + "mean_token_accuracy": 0.39808136224746704, + "num_tokens": 6267510434.0, + "step": 12259 + }, + { + "epoch": 3.3153055705786914, + "grad_norm": 2.328125, + "learning_rate": 0.006849254671910785, + "loss": 2.8301, + "mean_token_accuracy": 0.4290577471256256, + "num_tokens": 6268034712.0, + "step": 12260 + }, + { + "epoch": 3.315575987020011, + "grad_norm": 2.640625, + "learning_rate": 0.006847855917726328, + "loss": 3.0496, + "mean_token_accuracy": 0.41845792531967163, + "num_tokens": 6268558792.0, + "step": 12261 + }, + { + "epoch": 3.3158464034613306, + "grad_norm": 2.28125, + "learning_rate": 0.0068464572909418585, + "loss": 2.8271, + "mean_token_accuracy": 0.43481171131134033, + "num_tokens": 6269025806.0, + "step": 12262 + }, + { + "epoch": 3.31611681990265, + "grad_norm": 2.5625, + "learning_rate": 0.006845058791600296, + "loss": 2.8645, + "mean_token_accuracy": 0.41542044281959534, + "num_tokens": 6269550025.0, + "step": 12263 + }, + { + "epoch": 3.31638723634397, + "grad_norm": 3.484375, + "learning_rate": 0.006843660419744549, + "loss": 3.0752, + "mean_token_accuracy": 0.3843584656715393, + "num_tokens": 6270074235.0, + "step": 12264 + }, + { + "epoch": 3.316657652785289, + "grad_norm": 3.171875, + "learning_rate": 0.00684226217541752, + "loss": 2.9883, + "mean_token_accuracy": 0.4125608205795288, + "num_tokens": 6270598329.0, + "step": 12265 + }, + { + "epoch": 3.316928069226609, + "grad_norm": 3.125, + "learning_rate": 0.006840864058662119, + "loss": 2.7464, + "mean_token_accuracy": 0.428261935710907, + "num_tokens": 6271122508.0, + "step": 12266 + }, + { + "epoch": 3.3171984856679284, + "grad_norm": 3.34375, + "learning_rate": 0.006839466069521237, + "loss": 2.5772, + "mean_token_accuracy": 0.4429889917373657, + "num_tokens": 6271646751.0, + "step": 12267 + }, + { + "epoch": 3.317468902109248, + "grad_norm": 3.046875, + "learning_rate": 0.006838068208037772, + "loss": 2.9498, + "mean_token_accuracy": 0.4391429126262665, + "num_tokens": 6272156801.0, + "step": 12268 + }, + { + "epoch": 3.3177393185505677, + "grad_norm": 3.625, + "learning_rate": 0.006836670474254617, + "loss": 2.9886, + "mean_token_accuracy": 0.42182987928390503, + "num_tokens": 6272681060.0, + "step": 12269 + }, + { + "epoch": 3.3180097349918873, + "grad_norm": 3.453125, + "learning_rate": 0.00683527286821465, + "loss": 2.8485, + "mean_token_accuracy": 0.41674354672431946, + "num_tokens": 6273205257.0, + "step": 12270 + }, + { + "epoch": 3.318280151433207, + "grad_norm": 8.5, + "learning_rate": 0.006833875389960764, + "loss": 9.474, + "mean_token_accuracy": 0.01559411734342575, + "num_tokens": 6273729272.0, + "step": 12271 + }, + { + "epoch": 3.3185505678745266, + "grad_norm": 7.5, + "learning_rate": 0.006832478039535832, + "loss": 3.4551, + "mean_token_accuracy": 0.3769385814666748, + "num_tokens": 6274253529.0, + "step": 12272 + }, + { + "epoch": 3.3188209843158463, + "grad_norm": 3.34375, + "learning_rate": 0.006831080816982727, + "loss": 2.9222, + "mean_token_accuracy": 0.4160763621330261, + "num_tokens": 6274777785.0, + "step": 12273 + }, + { + "epoch": 3.319091400757166, + "grad_norm": 2.515625, + "learning_rate": 0.006829683722344325, + "loss": 3.0274, + "mean_token_accuracy": 0.4061865508556366, + "num_tokens": 6275302046.0, + "step": 12274 + }, + { + "epoch": 3.3193618171984856, + "grad_norm": 3.015625, + "learning_rate": 0.006828286755663491, + "loss": 2.8671, + "mean_token_accuracy": 0.4115096926689148, + "num_tokens": 6275826319.0, + "step": 12275 + }, + { + "epoch": 3.319632233639805, + "grad_norm": 3.140625, + "learning_rate": 0.006826889916983084, + "loss": 2.7834, + "mean_token_accuracy": 0.4717913866043091, + "num_tokens": 6276350542.0, + "step": 12276 + }, + { + "epoch": 3.319902650081125, + "grad_norm": 2.703125, + "learning_rate": 0.00682549320634597, + "loss": 2.9515, + "mean_token_accuracy": 0.40840768814086914, + "num_tokens": 6276874689.0, + "step": 12277 + }, + { + "epoch": 3.3201730665224445, + "grad_norm": 3.03125, + "learning_rate": 0.006824096623794997, + "loss": 2.9666, + "mean_token_accuracy": 0.42207324504852295, + "num_tokens": 6277375625.0, + "step": 12278 + }, + { + "epoch": 3.320443482963764, + "grad_norm": 2.828125, + "learning_rate": 0.0068227001693730235, + "loss": 2.9753, + "mean_token_accuracy": 0.4367596209049225, + "num_tokens": 6277899735.0, + "step": 12279 + }, + { + "epoch": 3.3207138994050838, + "grad_norm": 3.265625, + "learning_rate": 0.006821303843122893, + "loss": 3.1813, + "mean_token_accuracy": 0.4030373990535736, + "num_tokens": 6278423910.0, + "step": 12280 + }, + { + "epoch": 3.3209843158464034, + "grad_norm": 3.453125, + "learning_rate": 0.006819907645087449, + "loss": 3.0286, + "mean_token_accuracy": 0.4067755937576294, + "num_tokens": 6278948113.0, + "step": 12281 + }, + { + "epoch": 3.321254732287723, + "grad_norm": 3.421875, + "learning_rate": 0.006818511575309529, + "loss": 3.1273, + "mean_token_accuracy": 0.4011967182159424, + "num_tokens": 6279472399.0, + "step": 12282 + }, + { + "epoch": 3.3215251487290427, + "grad_norm": 3.515625, + "learning_rate": 0.006817115633831974, + "loss": 3.0209, + "mean_token_accuracy": 0.42744117975234985, + "num_tokens": 6279984956.0, + "step": 12283 + }, + { + "epoch": 3.3217955651703623, + "grad_norm": 2.96875, + "learning_rate": 0.006815719820697614, + "loss": 3.0302, + "mean_token_accuracy": 0.45573967695236206, + "num_tokens": 6280445222.0, + "step": 12284 + }, + { + "epoch": 3.322065981611682, + "grad_norm": 3.046875, + "learning_rate": 0.00681432413594927, + "loss": 2.9685, + "mean_token_accuracy": 0.42900246381759644, + "num_tokens": 6280958525.0, + "step": 12285 + }, + { + "epoch": 3.3223363980530016, + "grad_norm": 2.609375, + "learning_rate": 0.006812928579629775, + "loss": 3.0181, + "mean_token_accuracy": 0.4145922064781189, + "num_tokens": 6281482570.0, + "step": 12286 + }, + { + "epoch": 3.3226068144943213, + "grad_norm": 3.1875, + "learning_rate": 0.00681153315178194, + "loss": 3.019, + "mean_token_accuracy": 0.4192279279232025, + "num_tokens": 6282006806.0, + "step": 12287 + }, + { + "epoch": 3.322877230935641, + "grad_norm": 3.171875, + "learning_rate": 0.006810137852448592, + "loss": 2.833, + "mean_token_accuracy": 0.43990159034729004, + "num_tokens": 6282531002.0, + "step": 12288 + }, + { + "epoch": 3.3231476473769606, + "grad_norm": 2.625, + "learning_rate": 0.006808742681672533, + "loss": 3.0177, + "mean_token_accuracy": 0.4068228006362915, + "num_tokens": 6283055233.0, + "step": 12289 + }, + { + "epoch": 3.32341806381828, + "grad_norm": 17.25, + "learning_rate": 0.00680734763949657, + "loss": 2.8922, + "mean_token_accuracy": 0.43988943099975586, + "num_tokens": 6283579507.0, + "step": 12290 + }, + { + "epoch": 3.3236884802596, + "grad_norm": 44.25, + "learning_rate": 0.006805952725963518, + "loss": 24.4617, + "mean_token_accuracy": 0.03305112570524216, + "num_tokens": 6284078980.0, + "step": 12291 + }, + { + "epoch": 3.3239588967009195, + "grad_norm": 6.15625, + "learning_rate": 0.006804557941116164, + "loss": 2.8869, + "mean_token_accuracy": 0.4110592007637024, + "num_tokens": 6284586514.0, + "step": 12292 + }, + { + "epoch": 3.324229313142239, + "grad_norm": 2.734375, + "learning_rate": 0.006803163284997316, + "loss": 3.1187, + "mean_token_accuracy": 0.4061509370803833, + "num_tokens": 6285072154.0, + "step": 12293 + }, + { + "epoch": 3.3244997295835588, + "grad_norm": 21.0, + "learning_rate": 0.006801768757649755, + "loss": 2.8284, + "mean_token_accuracy": 0.44308310747146606, + "num_tokens": 6285596125.0, + "step": 12294 + }, + { + "epoch": 3.3247701460248784, + "grad_norm": 5.09375, + "learning_rate": 0.006800374359116279, + "loss": 2.9743, + "mean_token_accuracy": 0.3828388452529907, + "num_tokens": 6286120225.0, + "step": 12295 + }, + { + "epoch": 3.325040562466198, + "grad_norm": 2.921875, + "learning_rate": 0.006798980089439666, + "loss": 2.9641, + "mean_token_accuracy": 0.4057018458843231, + "num_tokens": 6286644439.0, + "step": 12296 + }, + { + "epoch": 3.3253109789075177, + "grad_norm": 2.953125, + "learning_rate": 0.0067975859486627, + "loss": 2.9506, + "mean_token_accuracy": 0.42132893204689026, + "num_tokens": 6287122902.0, + "step": 12297 + }, + { + "epoch": 3.3255813953488373, + "grad_norm": 4.59375, + "learning_rate": 0.006796191936828156, + "loss": 2.9902, + "mean_token_accuracy": 0.44176027178764343, + "num_tokens": 6287610987.0, + "step": 12298 + }, + { + "epoch": 3.325851811790157, + "grad_norm": 3.5, + "learning_rate": 0.006794798053978801, + "loss": 3.0768, + "mean_token_accuracy": 0.4118964970111847, + "num_tokens": 6288135265.0, + "step": 12299 + }, + { + "epoch": 3.3261222282314766, + "grad_norm": 3.1875, + "learning_rate": 0.006793404300157414, + "loss": 2.8222, + "mean_token_accuracy": 0.41304662823677063, + "num_tokens": 6288659503.0, + "step": 12300 + }, + { + "epoch": 3.3263926446727963, + "grad_norm": 3.015625, + "learning_rate": 0.006792010675406753, + "loss": 2.9513, + "mean_token_accuracy": 0.42190104722976685, + "num_tokens": 6289183758.0, + "step": 12301 + }, + { + "epoch": 3.326663061114116, + "grad_norm": 3.265625, + "learning_rate": 0.006790617179769576, + "loss": 2.9955, + "mean_token_accuracy": 0.4191775918006897, + "num_tokens": 6289707930.0, + "step": 12302 + }, + { + "epoch": 3.3269334775554356, + "grad_norm": 3.0, + "learning_rate": 0.006789223813288648, + "loss": 3.1191, + "mean_token_accuracy": 0.41338732838630676, + "num_tokens": 6290181680.0, + "step": 12303 + }, + { + "epoch": 3.3272038939967548, + "grad_norm": 2.75, + "learning_rate": 0.006787830576006714, + "loss": 2.9209, + "mean_token_accuracy": 0.4317379593849182, + "num_tokens": 6290705860.0, + "step": 12304 + }, + { + "epoch": 3.327474310438075, + "grad_norm": 2.96875, + "learning_rate": 0.006786437467966524, + "loss": 3.0206, + "mean_token_accuracy": 0.40557658672332764, + "num_tokens": 6291229998.0, + "step": 12305 + }, + { + "epoch": 3.327744726879394, + "grad_norm": 3.03125, + "learning_rate": 0.006785044489210824, + "loss": 2.8997, + "mean_token_accuracy": 0.4483277201652527, + "num_tokens": 6291754124.0, + "step": 12306 + }, + { + "epoch": 3.328015143320714, + "grad_norm": 2.734375, + "learning_rate": 0.006783651639782354, + "loss": 2.7589, + "mean_token_accuracy": 0.43910473585128784, + "num_tokens": 6292278341.0, + "step": 12307 + }, + { + "epoch": 3.3282855597620333, + "grad_norm": 2.765625, + "learning_rate": 0.00678225891972385, + "loss": 2.8472, + "mean_token_accuracy": 0.43924325704574585, + "num_tokens": 6292802583.0, + "step": 12308 + }, + { + "epoch": 3.328555976203353, + "grad_norm": 33.75, + "learning_rate": 0.0067808663290780495, + "loss": 2.8952, + "mean_token_accuracy": 0.42754167318344116, + "num_tokens": 6293326738.0, + "step": 12309 + }, + { + "epoch": 3.3288263926446726, + "grad_norm": 4.53125, + "learning_rate": 0.0067794738678876735, + "loss": 3.022, + "mean_token_accuracy": 0.42036664485931396, + "num_tokens": 6293758104.0, + "step": 12310 + }, + { + "epoch": 3.3290968090859923, + "grad_norm": 13.875, + "learning_rate": 0.006778081536195455, + "loss": 10.1546, + "mean_token_accuracy": 0.00019865042122546583, + "num_tokens": 6294282348.0, + "step": 12311 + }, + { + "epoch": 3.329367225527312, + "grad_norm": 6.0, + "learning_rate": 0.0067766893340441125, + "loss": 3.1269, + "mean_token_accuracy": 0.3917089104652405, + "num_tokens": 6294806549.0, + "step": 12312 + }, + { + "epoch": 3.3296376419686315, + "grad_norm": 2.53125, + "learning_rate": 0.006775297261476355, + "loss": 3.1885, + "mean_token_accuracy": 0.3933540880680084, + "num_tokens": 6295330806.0, + "step": 12313 + }, + { + "epoch": 3.329908058409951, + "grad_norm": 3.078125, + "learning_rate": 0.006773905318534906, + "loss": 3.168, + "mean_token_accuracy": 0.39714157581329346, + "num_tokens": 6295854982.0, + "step": 12314 + }, + { + "epoch": 3.330178474851271, + "grad_norm": 2.828125, + "learning_rate": 0.00677251350526247, + "loss": 3.041, + "mean_token_accuracy": 0.41158440709114075, + "num_tokens": 6296379215.0, + "step": 12315 + }, + { + "epoch": 3.3304488912925905, + "grad_norm": 2.921875, + "learning_rate": 0.006771121821701749, + "loss": 3.0105, + "mean_token_accuracy": 0.4288334250450134, + "num_tokens": 6296903374.0, + "step": 12316 + }, + { + "epoch": 3.33071930773391, + "grad_norm": 3.0, + "learning_rate": 0.00676973026789545, + "loss": 2.9448, + "mean_token_accuracy": 0.41691625118255615, + "num_tokens": 6297427657.0, + "step": 12317 + }, + { + "epoch": 3.3309897241752298, + "grad_norm": 2.796875, + "learning_rate": 0.006768338843886266, + "loss": 3.0467, + "mean_token_accuracy": 0.4185822010040283, + "num_tokens": 6297951816.0, + "step": 12318 + }, + { + "epoch": 3.3312601406165494, + "grad_norm": 3.234375, + "learning_rate": 0.006766947549716889, + "loss": 3.0466, + "mean_token_accuracy": 0.40709006786346436, + "num_tokens": 6298476095.0, + "step": 12319 + }, + { + "epoch": 3.331530557057869, + "grad_norm": 2.921875, + "learning_rate": 0.0067655563854300115, + "loss": 2.899, + "mean_token_accuracy": 0.4044165313243866, + "num_tokens": 6299000125.0, + "step": 12320 + }, + { + "epoch": 3.3318009734991887, + "grad_norm": 2.15625, + "learning_rate": 0.006764165351068314, + "loss": 3.1345, + "mean_token_accuracy": 0.42632007598876953, + "num_tokens": 6299465202.0, + "step": 12321 + }, + { + "epoch": 3.3320713899405083, + "grad_norm": 3.3125, + "learning_rate": 0.00676277444667448, + "loss": 2.9866, + "mean_token_accuracy": 0.42240816354751587, + "num_tokens": 6299989479.0, + "step": 12322 + }, + { + "epoch": 3.332341806381828, + "grad_norm": 2.453125, + "learning_rate": 0.006761383672291191, + "loss": 2.8857, + "mean_token_accuracy": 0.45043277740478516, + "num_tokens": 6300394061.0, + "step": 12323 + }, + { + "epoch": 3.3326122228231476, + "grad_norm": 3.140625, + "learning_rate": 0.006759993027961116, + "loss": 2.6409, + "mean_token_accuracy": 0.4933958649635315, + "num_tokens": 6300918216.0, + "step": 12324 + }, + { + "epoch": 3.3328826392644673, + "grad_norm": 3.1875, + "learning_rate": 0.006758602513726919, + "loss": 3.046, + "mean_token_accuracy": 0.41549235582351685, + "num_tokens": 6301419010.0, + "step": 12325 + }, + { + "epoch": 3.333153055705787, + "grad_norm": 3.8125, + "learning_rate": 0.006757212129631274, + "loss": 2.8243, + "mean_token_accuracy": 0.43781980872154236, + "num_tokens": 6301897562.0, + "step": 12326 + }, + { + "epoch": 3.3334234721471065, + "grad_norm": 2.9375, + "learning_rate": 0.006755821875716839, + "loss": 2.982, + "mean_token_accuracy": 0.39774736762046814, + "num_tokens": 6302421601.0, + "step": 12327 + }, + { + "epoch": 3.333693888588426, + "grad_norm": 2.859375, + "learning_rate": 0.006754431752026267, + "loss": 2.8802, + "mean_token_accuracy": 0.437555730342865, + "num_tokens": 6302945825.0, + "step": 12328 + }, + { + "epoch": 3.333964305029746, + "grad_norm": 2.859375, + "learning_rate": 0.006753041758602216, + "loss": 2.8399, + "mean_token_accuracy": 0.4282524883747101, + "num_tokens": 6303469889.0, + "step": 12329 + }, + { + "epoch": 3.3342347214710655, + "grad_norm": 2.984375, + "learning_rate": 0.0067516518954873345, + "loss": 2.8023, + "mean_token_accuracy": 0.44656381011009216, + "num_tokens": 6303955357.0, + "step": 12330 + }, + { + "epoch": 3.334505137912385, + "grad_norm": 18.875, + "learning_rate": 0.006750262162724264, + "loss": 15.0047, + "mean_token_accuracy": 0.029062677174806595, + "num_tokens": 6304479597.0, + "step": 12331 + }, + { + "epoch": 3.3347755543537048, + "grad_norm": 6.78125, + "learning_rate": 0.00674887256035565, + "loss": 3.2151, + "mean_token_accuracy": 0.4174919128417969, + "num_tokens": 6304895122.0, + "step": 12332 + }, + { + "epoch": 3.3350459707950244, + "grad_norm": 2.828125, + "learning_rate": 0.006747483088424123, + "loss": 2.6535, + "mean_token_accuracy": 0.4837649166584015, + "num_tokens": 6305382670.0, + "step": 12333 + }, + { + "epoch": 3.335316387236344, + "grad_norm": 2.484375, + "learning_rate": 0.006746093746972326, + "loss": 2.8971, + "mean_token_accuracy": 0.41229960322380066, + "num_tokens": 6305906944.0, + "step": 12334 + }, + { + "epoch": 3.3355868036776637, + "grad_norm": 3.78125, + "learning_rate": 0.006744704536042878, + "loss": 3.0587, + "mean_token_accuracy": 0.42920756340026855, + "num_tokens": 6306431211.0, + "step": 12335 + }, + { + "epoch": 3.3358572201189833, + "grad_norm": 2.859375, + "learning_rate": 0.006743315455678414, + "loss": 2.9071, + "mean_token_accuracy": 0.42508718371391296, + "num_tokens": 6306955453.0, + "step": 12336 + }, + { + "epoch": 3.336127636560303, + "grad_norm": 2.3125, + "learning_rate": 0.006741926505921545, + "loss": 2.8879, + "mean_token_accuracy": 0.4288066625595093, + "num_tokens": 6307479548.0, + "step": 12337 + }, + { + "epoch": 3.3363980530016226, + "grad_norm": 2.859375, + "learning_rate": 0.006740537686814897, + "loss": 3.0305, + "mean_token_accuracy": 0.4315195381641388, + "num_tokens": 6308003659.0, + "step": 12338 + }, + { + "epoch": 3.3366684694429423, + "grad_norm": 2.609375, + "learning_rate": 0.006739148998401076, + "loss": 2.9619, + "mean_token_accuracy": 0.4142974615097046, + "num_tokens": 6308527804.0, + "step": 12339 + }, + { + "epoch": 3.336938885884262, + "grad_norm": 2.640625, + "learning_rate": 0.006737760440722698, + "loss": 3.0481, + "mean_token_accuracy": 0.40915656089782715, + "num_tokens": 6309027436.0, + "step": 12340 + }, + { + "epoch": 3.3372093023255816, + "grad_norm": 2.546875, + "learning_rate": 0.0067363720138223625, + "loss": 2.9909, + "mean_token_accuracy": 0.4083147644996643, + "num_tokens": 6309510194.0, + "step": 12341 + }, + { + "epoch": 3.337479718766901, + "grad_norm": 3.078125, + "learning_rate": 0.006734983717742669, + "loss": 2.9279, + "mean_token_accuracy": 0.4199328124523163, + "num_tokens": 6310034262.0, + "step": 12342 + }, + { + "epoch": 3.337750135208221, + "grad_norm": 3.359375, + "learning_rate": 0.006733595552526221, + "loss": 3.083, + "mean_token_accuracy": 0.4285493791103363, + "num_tokens": 6310558439.0, + "step": 12343 + }, + { + "epoch": 3.3380205516495405, + "grad_norm": 3.46875, + "learning_rate": 0.00673220751821561, + "loss": 2.9987, + "mean_token_accuracy": 0.4165721535682678, + "num_tokens": 6311082644.0, + "step": 12344 + }, + { + "epoch": 3.3382909680908597, + "grad_norm": 2.984375, + "learning_rate": 0.006730819614853417, + "loss": 2.9779, + "mean_token_accuracy": 0.4160820245742798, + "num_tokens": 6311551891.0, + "step": 12345 + }, + { + "epoch": 3.3385613845321798, + "grad_norm": 3.109375, + "learning_rate": 0.006729431842482239, + "loss": 2.9219, + "mean_token_accuracy": 0.410326212644577, + "num_tokens": 6312076155.0, + "step": 12346 + }, + { + "epoch": 3.338831800973499, + "grad_norm": 2.734375, + "learning_rate": 0.006728044201144649, + "loss": 2.9846, + "mean_token_accuracy": 0.42470303177833557, + "num_tokens": 6312600368.0, + "step": 12347 + }, + { + "epoch": 3.339102217414819, + "grad_norm": 3.78125, + "learning_rate": 0.006726656690883224, + "loss": 3.1596, + "mean_token_accuracy": 0.4125215709209442, + "num_tokens": 6313073193.0, + "step": 12348 + }, + { + "epoch": 3.3393726338561383, + "grad_norm": 3.015625, + "learning_rate": 0.00672526931174054, + "loss": 3.0985, + "mean_token_accuracy": 0.41477349400520325, + "num_tokens": 6313576168.0, + "step": 12349 + }, + { + "epoch": 3.339643050297458, + "grad_norm": 2.953125, + "learning_rate": 0.006723882063759163, + "loss": 2.9701, + "mean_token_accuracy": 0.4436350464820862, + "num_tokens": 6314035798.0, + "step": 12350 + }, + { + "epoch": 3.3399134667387775, + "grad_norm": 11.4375, + "learning_rate": 0.006722494946981659, + "loss": 9.9936, + "mean_token_accuracy": 0.005694701336324215, + "num_tokens": 6314556420.0, + "step": 12351 + }, + { + "epoch": 3.340183883180097, + "grad_norm": 6.625, + "learning_rate": 0.006721107961450594, + "loss": 3.1029, + "mean_token_accuracy": 0.3888504207134247, + "num_tokens": 6315080573.0, + "step": 12352 + }, + { + "epoch": 3.340454299621417, + "grad_norm": 1.90625, + "learning_rate": 0.006719721107208519, + "loss": 2.9647, + "mean_token_accuracy": 0.4235008955001831, + "num_tokens": 6315604855.0, + "step": 12353 + }, + { + "epoch": 3.3407247160627365, + "grad_norm": 3.5, + "learning_rate": 0.0067183343842979835, + "loss": 2.9654, + "mean_token_accuracy": 0.4196898341178894, + "num_tokens": 6316126901.0, + "step": 12354 + }, + { + "epoch": 3.340995132504056, + "grad_norm": 4.125, + "learning_rate": 0.006716947792761547, + "loss": 2.9011, + "mean_token_accuracy": 0.44277679920196533, + "num_tokens": 6316632581.0, + "step": 12355 + }, + { + "epoch": 3.3412655489453758, + "grad_norm": 4.375, + "learning_rate": 0.006715561332641742, + "loss": 3.0969, + "mean_token_accuracy": 0.41399115324020386, + "num_tokens": 6317156843.0, + "step": 12356 + }, + { + "epoch": 3.3415359653866954, + "grad_norm": 2.921875, + "learning_rate": 0.0067141750039811186, + "loss": 2.9408, + "mean_token_accuracy": 0.4305635690689087, + "num_tokens": 6317681066.0, + "step": 12357 + }, + { + "epoch": 3.341806381828015, + "grad_norm": 3.484375, + "learning_rate": 0.0067127888068222105, + "loss": 2.8463, + "mean_token_accuracy": 0.40862298011779785, + "num_tokens": 6318205238.0, + "step": 12358 + }, + { + "epoch": 3.3420767982693347, + "grad_norm": 2.03125, + "learning_rate": 0.006711402741207546, + "loss": 2.89, + "mean_token_accuracy": 0.43431371450424194, + "num_tokens": 6318683198.0, + "step": 12359 + }, + { + "epoch": 3.3423472147106543, + "grad_norm": 2.421875, + "learning_rate": 0.006710016807179662, + "loss": 3.0341, + "mean_token_accuracy": 0.42887088656425476, + "num_tokens": 6319207456.0, + "step": 12360 + }, + { + "epoch": 3.342617631151974, + "grad_norm": 2.890625, + "learning_rate": 0.006708631004781076, + "loss": 3.0477, + "mean_token_accuracy": 0.4044383764266968, + "num_tokens": 6319731681.0, + "step": 12361 + }, + { + "epoch": 3.3428880475932936, + "grad_norm": 2.5625, + "learning_rate": 0.006707245334054309, + "loss": 2.857, + "mean_token_accuracy": 0.42842864990234375, + "num_tokens": 6320255941.0, + "step": 12362 + }, + { + "epoch": 3.3431584640346133, + "grad_norm": 2.46875, + "learning_rate": 0.006705859795041882, + "loss": 2.8052, + "mean_token_accuracy": 0.4304811954498291, + "num_tokens": 6320768927.0, + "step": 12363 + }, + { + "epoch": 3.343428880475933, + "grad_norm": 2.34375, + "learning_rate": 0.006704474387786302, + "loss": 2.974, + "mean_token_accuracy": 0.4166015386581421, + "num_tokens": 6321293131.0, + "step": 12364 + }, + { + "epoch": 3.3436992969172525, + "grad_norm": 2.265625, + "learning_rate": 0.006703089112330081, + "loss": 2.6296, + "mean_token_accuracy": 0.43937933444976807, + "num_tokens": 6321817386.0, + "step": 12365 + }, + { + "epoch": 3.343969713358572, + "grad_norm": 2.90625, + "learning_rate": 0.006701703968715725, + "loss": 2.8118, + "mean_token_accuracy": 0.4226526618003845, + "num_tokens": 6322322635.0, + "step": 12366 + }, + { + "epoch": 3.344240129799892, + "grad_norm": 3.21875, + "learning_rate": 0.006700318956985731, + "loss": 2.6888, + "mean_token_accuracy": 0.4331516623497009, + "num_tokens": 6322761231.0, + "step": 12367 + }, + { + "epoch": 3.3445105462412115, + "grad_norm": 2.875, + "learning_rate": 0.006698934077182592, + "loss": 3.0141, + "mean_token_accuracy": 0.4171018600463867, + "num_tokens": 6323217062.0, + "step": 12368 + }, + { + "epoch": 3.344780962682531, + "grad_norm": 2.890625, + "learning_rate": 0.006697549329348809, + "loss": 2.8211, + "mean_token_accuracy": 0.42840903997421265, + "num_tokens": 6323732757.0, + "step": 12369 + }, + { + "epoch": 3.3450513791238508, + "grad_norm": 3.484375, + "learning_rate": 0.006696164713526865, + "loss": 3.1236, + "mean_token_accuracy": 0.3985978364944458, + "num_tokens": 6324256887.0, + "step": 12370 + }, + { + "epoch": 3.3453217955651704, + "grad_norm": 10.0625, + "learning_rate": 0.006694780229759241, + "loss": 10.6695, + "mean_token_accuracy": 0.00032287309295497835, + "num_tokens": 6324669631.0, + "step": 12371 + }, + { + "epoch": 3.34559221200649, + "grad_norm": 6.78125, + "learning_rate": 0.006693395878088422, + "loss": 3.1731, + "mean_token_accuracy": 0.4097215533256531, + "num_tokens": 6325193712.0, + "step": 12372 + }, + { + "epoch": 3.3458626284478097, + "grad_norm": 2.171875, + "learning_rate": 0.006692011658556885, + "loss": 2.8691, + "mean_token_accuracy": 0.42506712675094604, + "num_tokens": 6325715716.0, + "step": 12373 + }, + { + "epoch": 3.3461330448891293, + "grad_norm": 2.578125, + "learning_rate": 0.006690627571207095, + "loss": 2.9945, + "mean_token_accuracy": 0.421251505613327, + "num_tokens": 6326239896.0, + "step": 12374 + }, + { + "epoch": 3.346403461330449, + "grad_norm": 3.40625, + "learning_rate": 0.006689243616081526, + "loss": 3.1325, + "mean_token_accuracy": 0.40805870294570923, + "num_tokens": 6326764124.0, + "step": 12375 + }, + { + "epoch": 3.3466738777717686, + "grad_norm": 2.765625, + "learning_rate": 0.006687859793222641, + "loss": 2.9918, + "mean_token_accuracy": 0.42138248682022095, + "num_tokens": 6327288402.0, + "step": 12376 + }, + { + "epoch": 3.3469442942130883, + "grad_norm": 3.5, + "learning_rate": 0.006686476102672896, + "loss": 2.9536, + "mean_token_accuracy": 0.4183356761932373, + "num_tokens": 6327812494.0, + "step": 12377 + }, + { + "epoch": 3.347214710654408, + "grad_norm": 3.34375, + "learning_rate": 0.006685092544474749, + "loss": 2.9958, + "mean_token_accuracy": 0.39176613092422485, + "num_tokens": 6328315167.0, + "step": 12378 + }, + { + "epoch": 3.3474851270957275, + "grad_norm": 3.3125, + "learning_rate": 0.006683709118670655, + "loss": 3.0105, + "mean_token_accuracy": 0.4149037301540375, + "num_tokens": 6328839383.0, + "step": 12379 + }, + { + "epoch": 3.347755543537047, + "grad_norm": 3.203125, + "learning_rate": 0.006682325825303055, + "loss": 2.7853, + "mean_token_accuracy": 0.40694576501846313, + "num_tokens": 6329363463.0, + "step": 12380 + }, + { + "epoch": 3.348025959978367, + "grad_norm": 2.578125, + "learning_rate": 0.0066809426644144024, + "loss": 3.0404, + "mean_token_accuracy": 0.42831066250801086, + "num_tokens": 6329868234.0, + "step": 12381 + }, + { + "epoch": 3.3482963764196865, + "grad_norm": 3.140625, + "learning_rate": 0.006679559636047124, + "loss": 3.0293, + "mean_token_accuracy": 0.40589553117752075, + "num_tokens": 6330392420.0, + "step": 12382 + }, + { + "epoch": 3.348566792861006, + "grad_norm": 2.75, + "learning_rate": 0.006678176740243666, + "loss": 2.9959, + "mean_token_accuracy": 0.410355806350708, + "num_tokens": 6330916640.0, + "step": 12383 + }, + { + "epoch": 3.3488372093023258, + "grad_norm": 2.703125, + "learning_rate": 0.006676793977046454, + "loss": 2.9023, + "mean_token_accuracy": 0.4196780323982239, + "num_tokens": 6331401930.0, + "step": 12384 + }, + { + "epoch": 3.3491076257436454, + "grad_norm": 2.921875, + "learning_rate": 0.006675411346497915, + "loss": 3.1849, + "mean_token_accuracy": 0.40030035376548767, + "num_tokens": 6331926187.0, + "step": 12385 + }, + { + "epoch": 3.3493780421849646, + "grad_norm": 3.21875, + "learning_rate": 0.006674028848640476, + "loss": 2.8881, + "mean_token_accuracy": 0.42934200167655945, + "num_tokens": 6332450427.0, + "step": 12386 + }, + { + "epoch": 3.3496484586262847, + "grad_norm": 2.890625, + "learning_rate": 0.006672646483516553, + "loss": 2.919, + "mean_token_accuracy": 0.4263894855976105, + "num_tokens": 6332934252.0, + "step": 12387 + }, + { + "epoch": 3.349918875067604, + "grad_norm": 3.234375, + "learning_rate": 0.0066712642511685604, + "loss": 2.8627, + "mean_token_accuracy": 0.445804238319397, + "num_tokens": 6333458383.0, + "step": 12388 + }, + { + "epoch": 3.350189291508924, + "grad_norm": 3.125, + "learning_rate": 0.006669882151638913, + "loss": 2.9257, + "mean_token_accuracy": 0.4200168251991272, + "num_tokens": 6333982666.0, + "step": 12389 + }, + { + "epoch": 3.350459707950243, + "grad_norm": 3.1875, + "learning_rate": 0.006668500184970014, + "loss": 2.8418, + "mean_token_accuracy": 0.4386402666568756, + "num_tokens": 6334500543.0, + "step": 12390 + }, + { + "epoch": 3.350730124391563, + "grad_norm": 30.375, + "learning_rate": 0.006667118351204267, + "loss": 13.3349, + "mean_token_accuracy": 0.04200829565525055, + "num_tokens": 6335024809.0, + "step": 12391 + }, + { + "epoch": 3.3510005408328825, + "grad_norm": 6.90625, + "learning_rate": 0.006665736650384069, + "loss": 3.0836, + "mean_token_accuracy": 0.4156085252761841, + "num_tokens": 6335548943.0, + "step": 12392 + }, + { + "epoch": 3.351270957274202, + "grad_norm": 2.4375, + "learning_rate": 0.00666435508255182, + "loss": 2.982, + "mean_token_accuracy": 0.385664165019989, + "num_tokens": 6336073222.0, + "step": 12393 + }, + { + "epoch": 3.3515413737155217, + "grad_norm": 2.875, + "learning_rate": 0.006662973647749904, + "loss": 2.9968, + "mean_token_accuracy": 0.40185311436653137, + "num_tokens": 6336597407.0, + "step": 12394 + }, + { + "epoch": 3.3518117901568414, + "grad_norm": 3.3125, + "learning_rate": 0.006661592346020715, + "loss": 2.8348, + "mean_token_accuracy": 0.4355524182319641, + "num_tokens": 6337113992.0, + "step": 12395 + }, + { + "epoch": 3.352082206598161, + "grad_norm": 3.390625, + "learning_rate": 0.006660211177406631, + "loss": 2.8418, + "mean_token_accuracy": 0.41948509216308594, + "num_tokens": 6337638164.0, + "step": 12396 + }, + { + "epoch": 3.3523526230394807, + "grad_norm": 3.21875, + "learning_rate": 0.0066588301419500265, + "loss": 2.9656, + "mean_token_accuracy": 0.40674489736557007, + "num_tokens": 6338162394.0, + "step": 12397 + }, + { + "epoch": 3.3526230394808003, + "grad_norm": 2.890625, + "learning_rate": 0.006657449239693284, + "loss": 2.7032, + "mean_token_accuracy": 0.4185577630996704, + "num_tokens": 6338686516.0, + "step": 12398 + }, + { + "epoch": 3.35289345592212, + "grad_norm": 2.375, + "learning_rate": 0.00665606847067877, + "loss": 2.9478, + "mean_token_accuracy": 0.428480863571167, + "num_tokens": 6339185580.0, + "step": 12399 + }, + { + "epoch": 3.3531638723634396, + "grad_norm": 3.328125, + "learning_rate": 0.006654687834948845, + "loss": 3.0923, + "mean_token_accuracy": 0.40137654542922974, + "num_tokens": 6339668132.0, + "step": 12400 + }, + { + "epoch": 3.3534342888047592, + "grad_norm": 3.0625, + "learning_rate": 0.0066533073325458795, + "loss": 2.9304, + "mean_token_accuracy": 0.4084204435348511, + "num_tokens": 6340192314.0, + "step": 12401 + }, + { + "epoch": 3.353704705246079, + "grad_norm": 2.90625, + "learning_rate": 0.006651926963512225, + "loss": 2.9841, + "mean_token_accuracy": 0.4126322269439697, + "num_tokens": 6340716585.0, + "step": 12402 + }, + { + "epoch": 3.3539751216873985, + "grad_norm": 2.90625, + "learning_rate": 0.006650546727890242, + "loss": 3.069, + "mean_token_accuracy": 0.40371137857437134, + "num_tokens": 6341240741.0, + "step": 12403 + }, + { + "epoch": 3.354245538128718, + "grad_norm": 2.75, + "learning_rate": 0.006649166625722278, + "loss": 2.7504, + "mean_token_accuracy": 0.4049428105354309, + "num_tokens": 6341764890.0, + "step": 12404 + }, + { + "epoch": 3.354515954570038, + "grad_norm": 3.546875, + "learning_rate": 0.006647786657050669, + "loss": 2.6994, + "mean_token_accuracy": 0.41113340854644775, + "num_tokens": 6342289024.0, + "step": 12405 + }, + { + "epoch": 3.3547863710113575, + "grad_norm": 3.484375, + "learning_rate": 0.0066464068219177705, + "loss": 2.9767, + "mean_token_accuracy": 0.41210776567459106, + "num_tokens": 6342813305.0, + "step": 12406 + }, + { + "epoch": 3.355056787452677, + "grad_norm": 3.546875, + "learning_rate": 0.0066450271203659095, + "loss": 2.9611, + "mean_token_accuracy": 0.40747523307800293, + "num_tokens": 6343337369.0, + "step": 12407 + }, + { + "epoch": 3.3553272038939967, + "grad_norm": 3.21875, + "learning_rate": 0.0066436475524374265, + "loss": 2.7794, + "mean_token_accuracy": 0.4365118741989136, + "num_tokens": 6343749825.0, + "step": 12408 + }, + { + "epoch": 3.3555976203353164, + "grad_norm": 2.796875, + "learning_rate": 0.006642268118174648, + "loss": 2.977, + "mean_token_accuracy": 0.4238765239715576, + "num_tokens": 6344224588.0, + "step": 12409 + }, + { + "epoch": 3.355868036776636, + "grad_norm": 3.578125, + "learning_rate": 0.0066408888176199, + "loss": 2.7829, + "mean_token_accuracy": 0.4317724406719208, + "num_tokens": 6344748766.0, + "step": 12410 + }, + { + "epoch": 3.3561384532179557, + "grad_norm": 109.0, + "learning_rate": 0.006639509650815499, + "loss": 18.6921, + "mean_token_accuracy": 0.0, + "num_tokens": 6345264288.0, + "step": 12411 + }, + { + "epoch": 3.3564088696592753, + "grad_norm": 6.09375, + "learning_rate": 0.006638130617803768, + "loss": 2.9372, + "mean_token_accuracy": 0.43863409757614136, + "num_tokens": 6345788434.0, + "step": 12412 + }, + { + "epoch": 3.356679286100595, + "grad_norm": 2.359375, + "learning_rate": 0.006636751718627018, + "loss": 2.7863, + "mean_token_accuracy": 0.4695212244987488, + "num_tokens": 6346248245.0, + "step": 12413 + }, + { + "epoch": 3.3569497025419146, + "grad_norm": 2.78125, + "learning_rate": 0.006635372953327552, + "loss": 2.9609, + "mean_token_accuracy": 0.41147753596305847, + "num_tokens": 6346772420.0, + "step": 12414 + }, + { + "epoch": 3.3572201189832342, + "grad_norm": 3.75, + "learning_rate": 0.006633994321947683, + "loss": 3.1874, + "mean_token_accuracy": 0.39948877692222595, + "num_tokens": 6347296628.0, + "step": 12415 + }, + { + "epoch": 3.357490535424554, + "grad_norm": 4.21875, + "learning_rate": 0.006632615824529709, + "loss": 2.9647, + "mean_token_accuracy": 0.419188529253006, + "num_tokens": 6347820858.0, + "step": 12416 + }, + { + "epoch": 3.3577609518658735, + "grad_norm": 2.515625, + "learning_rate": 0.006631237461115922, + "loss": 3.0366, + "mean_token_accuracy": 0.41500818729400635, + "num_tokens": 6348344947.0, + "step": 12417 + }, + { + "epoch": 3.358031368307193, + "grad_norm": 8.6875, + "learning_rate": 0.0066298592317486195, + "loss": 2.9279, + "mean_token_accuracy": 0.44130533933639526, + "num_tokens": 6348863691.0, + "step": 12418 + }, + { + "epoch": 3.358301784748513, + "grad_norm": 2.296875, + "learning_rate": 0.006628481136470088, + "loss": 2.8289, + "mean_token_accuracy": 0.4324769973754883, + "num_tokens": 6349387510.0, + "step": 12419 + }, + { + "epoch": 3.3585722011898325, + "grad_norm": 2.390625, + "learning_rate": 0.006627103175322606, + "loss": 3.0, + "mean_token_accuracy": 0.43727222084999084, + "num_tokens": 6349911606.0, + "step": 12420 + }, + { + "epoch": 3.358842617631152, + "grad_norm": 3.390625, + "learning_rate": 0.006625725348348461, + "loss": 3.0016, + "mean_token_accuracy": 0.4315950870513916, + "num_tokens": 6350430971.0, + "step": 12421 + }, + { + "epoch": 3.3591130340724717, + "grad_norm": 2.640625, + "learning_rate": 0.006624347655589929, + "loss": 2.7758, + "mean_token_accuracy": 0.4435032308101654, + "num_tokens": 6350951546.0, + "step": 12422 + }, + { + "epoch": 3.3593834505137914, + "grad_norm": 3.375, + "learning_rate": 0.006622970097089276, + "loss": 2.8509, + "mean_token_accuracy": 0.4449968934059143, + "num_tokens": 6351411316.0, + "step": 12423 + }, + { + "epoch": 3.359653866955111, + "grad_norm": 3.46875, + "learning_rate": 0.006621592672888774, + "loss": 3.0192, + "mean_token_accuracy": 0.4130239486694336, + "num_tokens": 6351935576.0, + "step": 12424 + }, + { + "epoch": 3.3599242833964307, + "grad_norm": 3.109375, + "learning_rate": 0.006620215383030681, + "loss": 3.0353, + "mean_token_accuracy": 0.41338491439819336, + "num_tokens": 6352459774.0, + "step": 12425 + }, + { + "epoch": 3.3601946998377503, + "grad_norm": 3.5, + "learning_rate": 0.006618838227557267, + "loss": 2.9517, + "mean_token_accuracy": 0.43780940771102905, + "num_tokens": 6352983919.0, + "step": 12426 + }, + { + "epoch": 3.3604651162790695, + "grad_norm": 2.828125, + "learning_rate": 0.006617461206510778, + "loss": 3.0376, + "mean_token_accuracy": 0.4107263386249542, + "num_tokens": 6353508202.0, + "step": 12427 + }, + { + "epoch": 3.3607355327203896, + "grad_norm": 3.140625, + "learning_rate": 0.006616084319933464, + "loss": 3.1158, + "mean_token_accuracy": 0.4026855230331421, + "num_tokens": 6354032312.0, + "step": 12428 + }, + { + "epoch": 3.361005949161709, + "grad_norm": 2.5625, + "learning_rate": 0.006614707567867579, + "loss": 2.9914, + "mean_token_accuracy": 0.4331956207752228, + "num_tokens": 6354548659.0, + "step": 12429 + }, + { + "epoch": 3.361276365603029, + "grad_norm": 3.46875, + "learning_rate": 0.006613330950355362, + "loss": 2.9037, + "mean_token_accuracy": 0.42109712958335876, + "num_tokens": 6355072748.0, + "step": 12430 + }, + { + "epoch": 3.361546782044348, + "grad_norm": 0.82421875, + "learning_rate": 0.006611954467439047, + "loss": 11.2396, + "mean_token_accuracy": 5.983008122711908e-06, + "num_tokens": 6355596905.0, + "step": 12431 + }, + { + "epoch": 3.3618171984856677, + "grad_norm": 7.125, + "learning_rate": 0.00661057811916088, + "loss": 3.2104, + "mean_token_accuracy": 0.4409983456134796, + "num_tokens": 6356009980.0, + "step": 12432 + }, + { + "epoch": 3.3620876149269874, + "grad_norm": 2.828125, + "learning_rate": 0.006609201905563081, + "loss": 3.0058, + "mean_token_accuracy": 0.41580942273139954, + "num_tokens": 6356534075.0, + "step": 12433 + }, + { + "epoch": 3.362358031368307, + "grad_norm": 3.078125, + "learning_rate": 0.006607825826687879, + "loss": 2.8801, + "mean_token_accuracy": 0.42368918657302856, + "num_tokens": 6357058357.0, + "step": 12434 + }, + { + "epoch": 3.3626284478096267, + "grad_norm": 3.234375, + "learning_rate": 0.0066064498825774975, + "loss": 2.832, + "mean_token_accuracy": 0.4304361045360565, + "num_tokens": 6357582608.0, + "step": 12435 + }, + { + "epoch": 3.3628988642509463, + "grad_norm": 2.515625, + "learning_rate": 0.006605074073274157, + "loss": 2.8926, + "mean_token_accuracy": 0.43511438369750977, + "num_tokens": 6358106809.0, + "step": 12436 + }, + { + "epoch": 3.363169280692266, + "grad_norm": 2.71875, + "learning_rate": 0.006603698398820066, + "loss": 2.958, + "mean_token_accuracy": 0.4054228663444519, + "num_tokens": 6358630829.0, + "step": 12437 + }, + { + "epoch": 3.3634396971335856, + "grad_norm": 2.578125, + "learning_rate": 0.006602322859257438, + "loss": 3.0245, + "mean_token_accuracy": 0.42119935154914856, + "num_tokens": 6359154922.0, + "step": 12438 + }, + { + "epoch": 3.3637101135749052, + "grad_norm": 2.921875, + "learning_rate": 0.006600947454628478, + "loss": 2.8163, + "mean_token_accuracy": 0.45110926032066345, + "num_tokens": 6359522169.0, + "step": 12439 + }, + { + "epoch": 3.363980530016225, + "grad_norm": 3.09375, + "learning_rate": 0.006599572184975386, + "loss": 3.0964, + "mean_token_accuracy": 0.42384231090545654, + "num_tokens": 6360016872.0, + "step": 12440 + }, + { + "epoch": 3.3642509464575445, + "grad_norm": 3.578125, + "learning_rate": 0.00659819705034036, + "loss": 2.983, + "mean_token_accuracy": 0.4184383749961853, + "num_tokens": 6360471911.0, + "step": 12441 + }, + { + "epoch": 3.364521362898864, + "grad_norm": 3.3125, + "learning_rate": 0.006596822050765596, + "loss": 2.8965, + "mean_token_accuracy": 0.4313291013240814, + "num_tokens": 6360996153.0, + "step": 12442 + }, + { + "epoch": 3.364791779340184, + "grad_norm": 3.046875, + "learning_rate": 0.006595447186293277, + "loss": 2.8215, + "mean_token_accuracy": 0.44939926266670227, + "num_tokens": 6361516843.0, + "step": 12443 + }, + { + "epoch": 3.3650621957815035, + "grad_norm": 3.359375, + "learning_rate": 0.0065940724569655915, + "loss": 2.9325, + "mean_token_accuracy": 0.4113987684249878, + "num_tokens": 6362041022.0, + "step": 12444 + }, + { + "epoch": 3.365332612222823, + "grad_norm": 3.546875, + "learning_rate": 0.006592697862824724, + "loss": 2.7379, + "mean_token_accuracy": 0.42523685097694397, + "num_tokens": 6362565231.0, + "step": 12445 + }, + { + "epoch": 3.3656030286641427, + "grad_norm": 2.484375, + "learning_rate": 0.006591323403912841, + "loss": 2.8654, + "mean_token_accuracy": 0.4356461465358734, + "num_tokens": 6363075430.0, + "step": 12446 + }, + { + "epoch": 3.3658734451054624, + "grad_norm": 3.421875, + "learning_rate": 0.006589949080272126, + "loss": 3.1451, + "mean_token_accuracy": 0.408346027135849, + "num_tokens": 6363546435.0, + "step": 12447 + }, + { + "epoch": 3.366143861546782, + "grad_norm": 3.046875, + "learning_rate": 0.006588574891944738, + "loss": 3.0399, + "mean_token_accuracy": 0.412058562040329, + "num_tokens": 6364070698.0, + "step": 12448 + }, + { + "epoch": 3.3664142779881017, + "grad_norm": 3.28125, + "learning_rate": 0.006587200838972849, + "loss": 2.8152, + "mean_token_accuracy": 0.4310852885246277, + "num_tokens": 6364594927.0, + "step": 12449 + }, + { + "epoch": 3.3666846944294213, + "grad_norm": 2.9375, + "learning_rate": 0.006585826921398612, + "loss": 3.0546, + "mean_token_accuracy": 0.4056306481361389, + "num_tokens": 6365119147.0, + "step": 12450 + }, + { + "epoch": 3.366955110870741, + "grad_norm": 199.0, + "learning_rate": 0.006584453139264186, + "loss": 10.9887, + "mean_token_accuracy": 9.020410652738065e-05, + "num_tokens": 6365626680.0, + "step": 12451 + }, + { + "epoch": 3.3672255273120606, + "grad_norm": 69.5, + "learning_rate": 0.006583079492611728, + "loss": 3.3251, + "mean_token_accuracy": 0.35110563039779663, + "num_tokens": 6366150941.0, + "step": 12452 + }, + { + "epoch": 3.3674959437533802, + "grad_norm": 7.71875, + "learning_rate": 0.006581705981483379, + "loss": 3.1237, + "mean_token_accuracy": 0.3598983585834503, + "num_tokens": 6366675109.0, + "step": 12453 + }, + { + "epoch": 3.3677663601947, + "grad_norm": 2.96875, + "learning_rate": 0.0065803326059212805, + "loss": 3.1009, + "mean_token_accuracy": 0.39996007084846497, + "num_tokens": 6367199379.0, + "step": 12454 + }, + { + "epoch": 3.3680367766360195, + "grad_norm": 1.96875, + "learning_rate": 0.00657895936596758, + "loss": 3.0449, + "mean_token_accuracy": 0.43425196409225464, + "num_tokens": 6367668616.0, + "step": 12455 + }, + { + "epoch": 3.368307193077339, + "grad_norm": 2.453125, + "learning_rate": 0.006577586261664405, + "loss": 2.9334, + "mean_token_accuracy": 0.43486618995666504, + "num_tokens": 6368192834.0, + "step": 12456 + }, + { + "epoch": 3.368577609518659, + "grad_norm": 3.625, + "learning_rate": 0.006576213293053888, + "loss": 3.095, + "mean_token_accuracy": 0.42807018756866455, + "num_tokens": 6368672118.0, + "step": 12457 + }, + { + "epoch": 3.3688480259599785, + "grad_norm": 3.140625, + "learning_rate": 0.006574840460178159, + "loss": 2.8744, + "mean_token_accuracy": 0.42253828048706055, + "num_tokens": 6369161269.0, + "step": 12458 + }, + { + "epoch": 3.369118442401298, + "grad_norm": 3.21875, + "learning_rate": 0.006573467763079337, + "loss": 3.0189, + "mean_token_accuracy": 0.41466575860977173, + "num_tokens": 6369668696.0, + "step": 12459 + }, + { + "epoch": 3.3693888588426177, + "grad_norm": 3.578125, + "learning_rate": 0.00657209520179954, + "loss": 2.9061, + "mean_token_accuracy": 0.40425869822502136, + "num_tokens": 6370192957.0, + "step": 12460 + }, + { + "epoch": 3.3696592752839374, + "grad_norm": 3.671875, + "learning_rate": 0.006570722776380888, + "loss": 3.0111, + "mean_token_accuracy": 0.3996497094631195, + "num_tokens": 6370717240.0, + "step": 12461 + }, + { + "epoch": 3.369929691725257, + "grad_norm": 3.5625, + "learning_rate": 0.006569350486865481, + "loss": 2.8584, + "mean_token_accuracy": 0.42668938636779785, + "num_tokens": 6371241429.0, + "step": 12462 + }, + { + "epoch": 3.3702001081665767, + "grad_norm": 3.5, + "learning_rate": 0.006567978333295436, + "loss": 3.1066, + "mean_token_accuracy": 0.4119129478931427, + "num_tokens": 6371765510.0, + "step": 12463 + }, + { + "epoch": 3.3704705246078963, + "grad_norm": 3.359375, + "learning_rate": 0.006566606315712844, + "loss": 3.0215, + "mean_token_accuracy": 0.4224291443824768, + "num_tokens": 6372248765.0, + "step": 12464 + }, + { + "epoch": 3.370740941049216, + "grad_norm": 3.109375, + "learning_rate": 0.006565234434159814, + "loss": 2.9002, + "mean_token_accuracy": 0.43458178639411926, + "num_tokens": 6372728169.0, + "step": 12465 + }, + { + "epoch": 3.3710113574905356, + "grad_norm": 2.609375, + "learning_rate": 0.006563862688678427, + "loss": 2.9485, + "mean_token_accuracy": 0.43852269649505615, + "num_tokens": 6373204604.0, + "step": 12466 + }, + { + "epoch": 3.3712817739318552, + "grad_norm": 3.15625, + "learning_rate": 0.006562491079310785, + "loss": 2.8692, + "mean_token_accuracy": 0.4339430332183838, + "num_tokens": 6373728830.0, + "step": 12467 + }, + { + "epoch": 3.3715521903731744, + "grad_norm": 4.8125, + "learning_rate": 0.006561119606098964, + "loss": 2.7765, + "mean_token_accuracy": 0.4428790211677551, + "num_tokens": 6374202353.0, + "step": 12468 + }, + { + "epoch": 3.3718226068144945, + "grad_norm": 3.421875, + "learning_rate": 0.006559748269085047, + "loss": 3.0107, + "mean_token_accuracy": 0.4449109733104706, + "num_tokens": 6374726620.0, + "step": 12469 + }, + { + "epoch": 3.3720930232558137, + "grad_norm": 3.609375, + "learning_rate": 0.006558377068311112, + "loss": 3.1918, + "mean_token_accuracy": 0.38913607597351074, + "num_tokens": 6375247206.0, + "step": 12470 + }, + { + "epoch": 3.372363439697134, + "grad_norm": 164.0, + "learning_rate": 0.006557006003819227, + "loss": 17.2052, + "mean_token_accuracy": 0.0, + "num_tokens": 6375762878.0, + "step": 12471 + }, + { + "epoch": 3.372633856138453, + "grad_norm": 5.125, + "learning_rate": 0.006555635075651468, + "loss": 3.2058, + "mean_token_accuracy": 0.3827436566352844, + "num_tokens": 6376287140.0, + "step": 12472 + }, + { + "epoch": 3.372904272579773, + "grad_norm": 2.140625, + "learning_rate": 0.006554264283849896, + "loss": 2.9155, + "mean_token_accuracy": 0.4556483030319214, + "num_tokens": 6376723273.0, + "step": 12473 + }, + { + "epoch": 3.3731746890210923, + "grad_norm": 2.390625, + "learning_rate": 0.006552893628456564, + "loss": 2.9253, + "mean_token_accuracy": 0.4281635284423828, + "num_tokens": 6377247474.0, + "step": 12474 + }, + { + "epoch": 3.373445105462412, + "grad_norm": 2.8125, + "learning_rate": 0.006551523109513539, + "loss": 2.8965, + "mean_token_accuracy": 0.4219834804534912, + "num_tokens": 6377771749.0, + "step": 12475 + }, + { + "epoch": 3.3737155219037316, + "grad_norm": 2.921875, + "learning_rate": 0.006550152727062866, + "loss": 2.7699, + "mean_token_accuracy": 0.4566037058830261, + "num_tokens": 6378286295.0, + "step": 12476 + }, + { + "epoch": 3.3739859383450512, + "grad_norm": 3.09375, + "learning_rate": 0.00654878248114659, + "loss": 2.897, + "mean_token_accuracy": 0.43257442116737366, + "num_tokens": 6378810486.0, + "step": 12477 + }, + { + "epoch": 3.374256354786371, + "grad_norm": 2.828125, + "learning_rate": 0.006547412371806758, + "loss": 2.8337, + "mean_token_accuracy": 0.4180791974067688, + "num_tokens": 6379334610.0, + "step": 12478 + }, + { + "epoch": 3.3745267712276905, + "grad_norm": 3.15625, + "learning_rate": 0.0065460423990854125, + "loss": 3.0949, + "mean_token_accuracy": 0.39239442348480225, + "num_tokens": 6379858746.0, + "step": 12479 + }, + { + "epoch": 3.37479718766901, + "grad_norm": 3.640625, + "learning_rate": 0.00654467256302458, + "loss": 3.0898, + "mean_token_accuracy": 0.4140033423900604, + "num_tokens": 6380377288.0, + "step": 12480 + }, + { + "epoch": 3.37506760411033, + "grad_norm": 2.703125, + "learning_rate": 0.006543302863666301, + "loss": 2.7454, + "mean_token_accuracy": 0.4382239878177643, + "num_tokens": 6380874266.0, + "step": 12481 + }, + { + "epoch": 3.3753380205516494, + "grad_norm": 2.109375, + "learning_rate": 0.006541933301052594, + "loss": 3.0304, + "mean_token_accuracy": 0.41930314898490906, + "num_tokens": 6381398514.0, + "step": 12482 + }, + { + "epoch": 3.375608436992969, + "grad_norm": 2.6875, + "learning_rate": 0.006540563875225481, + "loss": 2.94, + "mean_token_accuracy": 0.4123480021953583, + "num_tokens": 6381899072.0, + "step": 12483 + }, + { + "epoch": 3.3758788534342887, + "grad_norm": 2.828125, + "learning_rate": 0.006539194586226985, + "loss": 2.995, + "mean_token_accuracy": 0.4088687300682068, + "num_tokens": 6382423330.0, + "step": 12484 + }, + { + "epoch": 3.3761492698756084, + "grad_norm": 3.265625, + "learning_rate": 0.006537825434099121, + "loss": 3.068, + "mean_token_accuracy": 0.4187951683998108, + "num_tokens": 6382947485.0, + "step": 12485 + }, + { + "epoch": 3.376419686316928, + "grad_norm": 3.28125, + "learning_rate": 0.006536456418883887, + "loss": 2.9548, + "mean_token_accuracy": 0.4384062886238098, + "num_tokens": 6383422252.0, + "step": 12486 + }, + { + "epoch": 3.3766901027582477, + "grad_norm": 3.078125, + "learning_rate": 0.006535087540623303, + "loss": 2.9113, + "mean_token_accuracy": 0.4314143657684326, + "num_tokens": 6383894781.0, + "step": 12487 + }, + { + "epoch": 3.3769605191995673, + "grad_norm": 13.8125, + "learning_rate": 0.0065337187993593625, + "loss": 2.9837, + "mean_token_accuracy": 0.462406724691391, + "num_tokens": 6384343676.0, + "step": 12488 + }, + { + "epoch": 3.377230935640887, + "grad_norm": 3.3125, + "learning_rate": 0.006532350195134061, + "loss": 3.1543, + "mean_token_accuracy": 0.3820597529411316, + "num_tokens": 6384854798.0, + "step": 12489 + }, + { + "epoch": 3.3775013520822066, + "grad_norm": 2.359375, + "learning_rate": 0.006530981727989399, + "loss": 3.0042, + "mean_token_accuracy": 0.4191280007362366, + "num_tokens": 6385341208.0, + "step": 12490 + }, + { + "epoch": 3.3777717685235262, + "grad_norm": 37.75, + "learning_rate": 0.006529613397967354, + "loss": 13.1457, + "mean_token_accuracy": 0.02180514484643936, + "num_tokens": 6385807230.0, + "step": 12491 + }, + { + "epoch": 3.378042184964846, + "grad_norm": 7.125, + "learning_rate": 0.006528245205109917, + "loss": 3.1361, + "mean_token_accuracy": 0.3897320032119751, + "num_tokens": 6386331333.0, + "step": 12492 + }, + { + "epoch": 3.3783126014061655, + "grad_norm": 2.453125, + "learning_rate": 0.006526877149459071, + "loss": 2.99, + "mean_token_accuracy": 0.40384793281555176, + "num_tokens": 6386855272.0, + "step": 12493 + }, + { + "epoch": 3.378583017847485, + "grad_norm": 2.0625, + "learning_rate": 0.006525509231056786, + "loss": 3.0764, + "mean_token_accuracy": 0.4251052737236023, + "num_tokens": 6387368626.0, + "step": 12494 + }, + { + "epoch": 3.378853434288805, + "grad_norm": 3.078125, + "learning_rate": 0.00652414144994504, + "loss": 2.8371, + "mean_token_accuracy": 0.42682427167892456, + "num_tokens": 6387892881.0, + "step": 12495 + }, + { + "epoch": 3.3791238507301244, + "grad_norm": 2.53125, + "learning_rate": 0.006522773806165797, + "loss": 3.0509, + "mean_token_accuracy": 0.4125504195690155, + "num_tokens": 6388417062.0, + "step": 12496 + }, + { + "epoch": 3.379394267171444, + "grad_norm": 3.390625, + "learning_rate": 0.006521406299761017, + "loss": 3.0646, + "mean_token_accuracy": 0.4190894365310669, + "num_tokens": 6388941242.0, + "step": 12497 + }, + { + "epoch": 3.3796646836127637, + "grad_norm": 2.75, + "learning_rate": 0.006520038930772665, + "loss": 2.8106, + "mean_token_accuracy": 0.4468245208263397, + "num_tokens": 6389429245.0, + "step": 12498 + }, + { + "epoch": 3.3799351000540834, + "grad_norm": 3.234375, + "learning_rate": 0.0065186716992426934, + "loss": 3.0231, + "mean_token_accuracy": 0.41902536153793335, + "num_tokens": 6389953517.0, + "step": 12499 + }, + { + "epoch": 3.380205516495403, + "grad_norm": 2.5625, + "learning_rate": 0.0065173046052130505, + "loss": 2.9877, + "mean_token_accuracy": 0.4158206880092621, + "num_tokens": 6390477800.0, + "step": 12500 + }, + { + "epoch": 3.3804759329367227, + "grad_norm": 3.5, + "learning_rate": 0.006515937648725687, + "loss": 2.7103, + "mean_token_accuracy": 0.4367784261703491, + "num_tokens": 6390969478.0, + "step": 12501 + }, + { + "epoch": 3.3807463493780423, + "grad_norm": 2.21875, + "learning_rate": 0.006514570829822544, + "loss": 2.8903, + "mean_token_accuracy": 0.44418585300445557, + "num_tokens": 6391458926.0, + "step": 12502 + }, + { + "epoch": 3.381016765819362, + "grad_norm": 3.265625, + "learning_rate": 0.006513204148545555, + "loss": 2.8611, + "mean_token_accuracy": 0.4429681599140167, + "num_tokens": 6391983205.0, + "step": 12503 + }, + { + "epoch": 3.3812871822606816, + "grad_norm": 2.265625, + "learning_rate": 0.00651183760493666, + "loss": 2.8799, + "mean_token_accuracy": 0.44146186113357544, + "num_tokens": 6392465260.0, + "step": 12504 + }, + { + "epoch": 3.3815575987020012, + "grad_norm": 2.796875, + "learning_rate": 0.006510471199037784, + "loss": 2.8685, + "mean_token_accuracy": 0.4270472526550293, + "num_tokens": 6392989518.0, + "step": 12505 + }, + { + "epoch": 3.381828015143321, + "grad_norm": 2.828125, + "learning_rate": 0.006509104930890858, + "loss": 2.9484, + "mean_token_accuracy": 0.42401278018951416, + "num_tokens": 6393513773.0, + "step": 12506 + }, + { + "epoch": 3.3820984315846405, + "grad_norm": 3.28125, + "learning_rate": 0.006507738800537796, + "loss": 2.8819, + "mean_token_accuracy": 0.4242328107357025, + "num_tokens": 6394037978.0, + "step": 12507 + }, + { + "epoch": 3.38236884802596, + "grad_norm": 3.15625, + "learning_rate": 0.006506372808020522, + "loss": 3.0759, + "mean_token_accuracy": 0.38456910848617554, + "num_tokens": 6394562248.0, + "step": 12508 + }, + { + "epoch": 3.3826392644672794, + "grad_norm": 2.734375, + "learning_rate": 0.006505006953380941, + "loss": 3.1138, + "mean_token_accuracy": 0.40874314308166504, + "num_tokens": 6395086487.0, + "step": 12509 + }, + { + "epoch": 3.3829096809085994, + "grad_norm": 2.625, + "learning_rate": 0.00650364123666097, + "loss": 2.8345, + "mean_token_accuracy": 0.42819035053253174, + "num_tokens": 6395610750.0, + "step": 12510 + }, + { + "epoch": 3.3831800973499186, + "grad_norm": 488.0, + "learning_rate": 0.006502275657902508, + "loss": 15.3407, + "mean_token_accuracy": 0.024908751249313354, + "num_tokens": 6396134896.0, + "step": 12511 + }, + { + "epoch": 3.3834505137912387, + "grad_norm": 8.125, + "learning_rate": 0.006500910217147452, + "loss": 3.3268, + "mean_token_accuracy": 0.4019646644592285, + "num_tokens": 6396597853.0, + "step": 12512 + }, + { + "epoch": 3.383720930232558, + "grad_norm": 60.75, + "learning_rate": 0.006499544914437705, + "loss": 3.7641, + "mean_token_accuracy": 0.3325428366661072, + "num_tokens": 6397122051.0, + "step": 12513 + }, + { + "epoch": 3.383991346673878, + "grad_norm": 4.3125, + "learning_rate": 0.006498179749815156, + "loss": 3.1077, + "mean_token_accuracy": 0.4266939163208008, + "num_tokens": 6397628576.0, + "step": 12514 + }, + { + "epoch": 3.384261763115197, + "grad_norm": 2.65625, + "learning_rate": 0.0064968147233216865, + "loss": 2.897, + "mean_token_accuracy": 0.4295353889465332, + "num_tokens": 6398152856.0, + "step": 12515 + }, + { + "epoch": 3.384532179556517, + "grad_norm": 2.84375, + "learning_rate": 0.00649544983499919, + "loss": 2.9942, + "mean_token_accuracy": 0.40360963344573975, + "num_tokens": 6398673569.0, + "step": 12516 + }, + { + "epoch": 3.3848025959978365, + "grad_norm": 3.0, + "learning_rate": 0.006494085084889534, + "loss": 2.9352, + "mean_token_accuracy": 0.43920812010765076, + "num_tokens": 6399197809.0, + "step": 12517 + }, + { + "epoch": 3.385073012439156, + "grad_norm": 3.484375, + "learning_rate": 0.006492720473034602, + "loss": 2.9276, + "mean_token_accuracy": 0.4322328567504883, + "num_tokens": 6399722042.0, + "step": 12518 + }, + { + "epoch": 3.385343428880476, + "grad_norm": 4.0, + "learning_rate": 0.006491355999476263, + "loss": 2.9803, + "mean_token_accuracy": 0.43338409066200256, + "num_tokens": 6400246322.0, + "step": 12519 + }, + { + "epoch": 3.3856138453217954, + "grad_norm": 3.25, + "learning_rate": 0.006489991664256376, + "loss": 2.9037, + "mean_token_accuracy": 0.4043170213699341, + "num_tokens": 6400770483.0, + "step": 12520 + }, + { + "epoch": 3.385884261763115, + "grad_norm": 2.59375, + "learning_rate": 0.006488627467416808, + "loss": 2.8386, + "mean_token_accuracy": 0.44451260566711426, + "num_tokens": 6401235512.0, + "step": 12521 + }, + { + "epoch": 3.3861546782044347, + "grad_norm": 2.765625, + "learning_rate": 0.0064872634089994195, + "loss": 3.0392, + "mean_token_accuracy": 0.42123666405677795, + "num_tokens": 6401759677.0, + "step": 12522 + }, + { + "epoch": 3.3864250946457544, + "grad_norm": 2.640625, + "learning_rate": 0.006485899489046057, + "loss": 2.8662, + "mean_token_accuracy": 0.423356294631958, + "num_tokens": 6402226960.0, + "step": 12523 + }, + { + "epoch": 3.386695511087074, + "grad_norm": 2.703125, + "learning_rate": 0.006484535707598577, + "loss": 2.9161, + "mean_token_accuracy": 0.41067802906036377, + "num_tokens": 6402751140.0, + "step": 12524 + }, + { + "epoch": 3.3869659275283936, + "grad_norm": 2.515625, + "learning_rate": 0.0064831720646988215, + "loss": 2.8327, + "mean_token_accuracy": 0.44091105461120605, + "num_tokens": 6403275189.0, + "step": 12525 + }, + { + "epoch": 3.3872363439697133, + "grad_norm": 3.71875, + "learning_rate": 0.0064818085603886265, + "loss": 3.0852, + "mean_token_accuracy": 0.42288273572921753, + "num_tokens": 6403799354.0, + "step": 12526 + }, + { + "epoch": 3.387506760411033, + "grad_norm": 3.484375, + "learning_rate": 0.006480445194709834, + "loss": 3.0695, + "mean_token_accuracy": 0.4205809235572815, + "num_tokens": 6404267987.0, + "step": 12527 + }, + { + "epoch": 3.3877771768523526, + "grad_norm": 3.671875, + "learning_rate": 0.0064790819677042755, + "loss": 2.8891, + "mean_token_accuracy": 0.3834249973297119, + "num_tokens": 6404792250.0, + "step": 12528 + }, + { + "epoch": 3.388047593293672, + "grad_norm": 3.09375, + "learning_rate": 0.006477718879413774, + "loss": 2.8597, + "mean_token_accuracy": 0.4379022717475891, + "num_tokens": 6405316486.0, + "step": 12529 + }, + { + "epoch": 3.388318009734992, + "grad_norm": 3.125, + "learning_rate": 0.00647635592988016, + "loss": 2.871, + "mean_token_accuracy": 0.42529261112213135, + "num_tokens": 6405840718.0, + "step": 12530 + }, + { + "epoch": 3.3885884261763115, + "grad_norm": 0.5625, + "learning_rate": 0.006474993119145248, + "loss": 11.288, + "mean_token_accuracy": 1.3730228602071293e-05, + "num_tokens": 6406333265.0, + "step": 12531 + }, + { + "epoch": 3.388858842617631, + "grad_norm": 6.03125, + "learning_rate": 0.006473630447250851, + "loss": 3.1923, + "mean_token_accuracy": 0.39217716455459595, + "num_tokens": 6406857524.0, + "step": 12532 + }, + { + "epoch": 3.389129259058951, + "grad_norm": 2.046875, + "learning_rate": 0.006472267914238787, + "loss": 2.6822, + "mean_token_accuracy": 0.4378356337547302, + "num_tokens": 6407381700.0, + "step": 12533 + }, + { + "epoch": 3.3893996755002704, + "grad_norm": 2.921875, + "learning_rate": 0.006470905520150856, + "loss": 2.9299, + "mean_token_accuracy": 0.4364818334579468, + "num_tokens": 6407905945.0, + "step": 12534 + }, + { + "epoch": 3.38967009194159, + "grad_norm": 2.875, + "learning_rate": 0.006469543265028862, + "loss": 2.8812, + "mean_token_accuracy": 0.42716914415359497, + "num_tokens": 6408430124.0, + "step": 12535 + }, + { + "epoch": 3.3899405083829097, + "grad_norm": 3.171875, + "learning_rate": 0.006468181148914607, + "loss": 2.8823, + "mean_token_accuracy": 0.43494713306427, + "num_tokens": 6408954410.0, + "step": 12536 + }, + { + "epoch": 3.3902109248242294, + "grad_norm": 3.25, + "learning_rate": 0.006466819171849883, + "loss": 2.8861, + "mean_token_accuracy": 0.39391282200813293, + "num_tokens": 6409478631.0, + "step": 12537 + }, + { + "epoch": 3.390481341265549, + "grad_norm": 3.5, + "learning_rate": 0.0064654573338764725, + "loss": 2.9264, + "mean_token_accuracy": 0.40636834502220154, + "num_tokens": 6410002804.0, + "step": 12538 + }, + { + "epoch": 3.3907517577068687, + "grad_norm": 2.578125, + "learning_rate": 0.006464095635036172, + "loss": 2.8632, + "mean_token_accuracy": 0.3963552713394165, + "num_tokens": 6410527011.0, + "step": 12539 + }, + { + "epoch": 3.3910221741481883, + "grad_norm": 2.90625, + "learning_rate": 0.006462734075370751, + "loss": 2.8163, + "mean_token_accuracy": 0.4510953426361084, + "num_tokens": 6410991441.0, + "step": 12540 + }, + { + "epoch": 3.391292590589508, + "grad_norm": 2.75, + "learning_rate": 0.006461372654921997, + "loss": 2.9161, + "mean_token_accuracy": 0.4151019752025604, + "num_tokens": 6411515657.0, + "step": 12541 + }, + { + "epoch": 3.3915630070308276, + "grad_norm": 3.6875, + "learning_rate": 0.006460011373731676, + "loss": 3.1321, + "mean_token_accuracy": 0.41294270753860474, + "num_tokens": 6412039767.0, + "step": 12542 + }, + { + "epoch": 3.3918334234721472, + "grad_norm": 3.390625, + "learning_rate": 0.006458650231841553, + "loss": 2.9526, + "mean_token_accuracy": 0.41318199038505554, + "num_tokens": 6412534606.0, + "step": 12543 + }, + { + "epoch": 3.392103839913467, + "grad_norm": 4.46875, + "learning_rate": 0.006457289229293402, + "loss": 2.7065, + "mean_token_accuracy": 0.4364522695541382, + "num_tokens": 6413058810.0, + "step": 12544 + }, + { + "epoch": 3.3923742563547865, + "grad_norm": 2.984375, + "learning_rate": 0.006455928366128975, + "loss": 2.9274, + "mean_token_accuracy": 0.4305935800075531, + "num_tokens": 6413520223.0, + "step": 12545 + }, + { + "epoch": 3.392644672796106, + "grad_norm": 3.34375, + "learning_rate": 0.006454567642390025, + "loss": 3.0047, + "mean_token_accuracy": 0.4195623993873596, + "num_tokens": 6414044482.0, + "step": 12546 + }, + { + "epoch": 3.392915089237426, + "grad_norm": 3.234375, + "learning_rate": 0.006453207058118311, + "loss": 3.0473, + "mean_token_accuracy": 0.40385594964027405, + "num_tokens": 6414548963.0, + "step": 12547 + }, + { + "epoch": 3.3931855056787454, + "grad_norm": 3.359375, + "learning_rate": 0.0064518466133555745, + "loss": 2.9358, + "mean_token_accuracy": 0.41809558868408203, + "num_tokens": 6415073225.0, + "step": 12548 + }, + { + "epoch": 3.393455922120065, + "grad_norm": 4.125, + "learning_rate": 0.006450486308143559, + "loss": 3.1686, + "mean_token_accuracy": 0.4066464900970459, + "num_tokens": 6415597447.0, + "step": 12549 + }, + { + "epoch": 3.3937263385613847, + "grad_norm": 2.921875, + "learning_rate": 0.006449126142524001, + "loss": 2.8849, + "mean_token_accuracy": 0.4159802496433258, + "num_tokens": 6416121678.0, + "step": 12550 + }, + { + "epoch": 3.3939967550027044, + "grad_norm": 6.09375, + "learning_rate": 0.0064477661165386405, + "loss": 10.5969, + "mean_token_accuracy": 2.7450807465356775e-05, + "num_tokens": 6416585727.0, + "step": 12551 + }, + { + "epoch": 3.3942671714440236, + "grad_norm": 5.96875, + "learning_rate": 0.0064464062302291986, + "loss": 3.2386, + "mean_token_accuracy": 0.3866899907588959, + "num_tokens": 6417109869.0, + "step": 12552 + }, + { + "epoch": 3.3945375878853437, + "grad_norm": 1.8671875, + "learning_rate": 0.006445046483637408, + "loss": 3.1108, + "mean_token_accuracy": 0.42041632533073425, + "num_tokens": 6417634150.0, + "step": 12553 + }, + { + "epoch": 3.394808004326663, + "grad_norm": 24.75, + "learning_rate": 0.006443686876804986, + "loss": 2.8495, + "mean_token_accuracy": 0.41261160373687744, + "num_tokens": 6418158267.0, + "step": 12554 + }, + { + "epoch": 3.395078420767983, + "grad_norm": 4.3125, + "learning_rate": 0.006442327409773648, + "loss": 2.964, + "mean_token_accuracy": 0.408679336309433, + "num_tokens": 6418682530.0, + "step": 12555 + }, + { + "epoch": 3.395348837209302, + "grad_norm": 2.328125, + "learning_rate": 0.006440968082585111, + "loss": 3.0495, + "mean_token_accuracy": 0.41648367047309875, + "num_tokens": 6419206651.0, + "step": 12556 + }, + { + "epoch": 3.395619253650622, + "grad_norm": 2.640625, + "learning_rate": 0.00643960889528108, + "loss": 3.0899, + "mean_token_accuracy": 0.4067609906196594, + "num_tokens": 6419730888.0, + "step": 12557 + }, + { + "epoch": 3.3958896700919414, + "grad_norm": 3.421875, + "learning_rate": 0.006438249847903257, + "loss": 2.8664, + "mean_token_accuracy": 0.4333876967430115, + "num_tokens": 6420255150.0, + "step": 12558 + }, + { + "epoch": 3.396160086533261, + "grad_norm": 2.828125, + "learning_rate": 0.006436890940493345, + "loss": 2.98, + "mean_token_accuracy": 0.4199994206428528, + "num_tokens": 6420779264.0, + "step": 12559 + }, + { + "epoch": 3.3964305029745807, + "grad_norm": 3.5625, + "learning_rate": 0.00643553217309304, + "loss": 2.9871, + "mean_token_accuracy": 0.4215767979621887, + "num_tokens": 6421303498.0, + "step": 12560 + }, + { + "epoch": 3.3967009194159004, + "grad_norm": 3.15625, + "learning_rate": 0.006434173545744029, + "loss": 3.0093, + "mean_token_accuracy": 0.4136248528957367, + "num_tokens": 6421827578.0, + "step": 12561 + }, + { + "epoch": 3.39697133585722, + "grad_norm": 2.71875, + "learning_rate": 0.006432815058487999, + "loss": 2.9884, + "mean_token_accuracy": 0.39788466691970825, + "num_tokens": 6422351861.0, + "step": 12562 + }, + { + "epoch": 3.3972417522985396, + "grad_norm": 3.0625, + "learning_rate": 0.006431456711366637, + "loss": 3.011, + "mean_token_accuracy": 0.4102770686149597, + "num_tokens": 6422876130.0, + "step": 12563 + }, + { + "epoch": 3.3975121687398593, + "grad_norm": 2.609375, + "learning_rate": 0.006430098504421615, + "loss": 2.9146, + "mean_token_accuracy": 0.4221068024635315, + "num_tokens": 6423400297.0, + "step": 12564 + }, + { + "epoch": 3.397782585181179, + "grad_norm": 2.765625, + "learning_rate": 0.006428740437694615, + "loss": 2.7374, + "mean_token_accuracy": 0.4405785799026489, + "num_tokens": 6423881272.0, + "step": 12565 + }, + { + "epoch": 3.3980530016224986, + "grad_norm": 2.71875, + "learning_rate": 0.006427382511227298, + "loss": 2.9054, + "mean_token_accuracy": 0.42275291681289673, + "num_tokens": 6424405479.0, + "step": 12566 + }, + { + "epoch": 3.398323418063818, + "grad_norm": 2.859375, + "learning_rate": 0.006426024725061335, + "loss": 3.1404, + "mean_token_accuracy": 0.39453399181365967, + "num_tokens": 6424929739.0, + "step": 12567 + }, + { + "epoch": 3.398593834505138, + "grad_norm": 2.59375, + "learning_rate": 0.006424667079238387, + "loss": 2.7272, + "mean_token_accuracy": 0.4226359724998474, + "num_tokens": 6425453840.0, + "step": 12568 + }, + { + "epoch": 3.3988642509464575, + "grad_norm": 3.359375, + "learning_rate": 0.006423309573800102, + "loss": 3.0253, + "mean_token_accuracy": 0.42173656821250916, + "num_tokens": 6425978099.0, + "step": 12569 + }, + { + "epoch": 3.399134667387777, + "grad_norm": 3.390625, + "learning_rate": 0.006421952208788143, + "loss": 3.0296, + "mean_token_accuracy": 0.4208643436431885, + "num_tokens": 6426453976.0, + "step": 12570 + }, + { + "epoch": 3.399405083829097, + "grad_norm": 99.0, + "learning_rate": 0.006420594984244152, + "loss": 19.8928, + "mean_token_accuracy": 0.0, + "num_tokens": 6426978132.0, + "step": 12571 + }, + { + "epoch": 3.3996755002704164, + "grad_norm": 6.8125, + "learning_rate": 0.006419237900209773, + "loss": 3.0013, + "mean_token_accuracy": 0.41327881813049316, + "num_tokens": 6427502103.0, + "step": 12572 + }, + { + "epoch": 3.399945916711736, + "grad_norm": 6.0, + "learning_rate": 0.006417880956726649, + "loss": 2.7421, + "mean_token_accuracy": 0.4473399519920349, + "num_tokens": 6428026340.0, + "step": 12573 + }, + { + "epoch": 3.4002163331530557, + "grad_norm": 2.75, + "learning_rate": 0.006416524153836411, + "loss": 2.8876, + "mean_token_accuracy": 0.3936317265033722, + "num_tokens": 6428550574.0, + "step": 12574 + }, + { + "epoch": 3.4004867495943754, + "grad_norm": 2.265625, + "learning_rate": 0.0064151674915806905, + "loss": 2.9888, + "mean_token_accuracy": 0.40602004528045654, + "num_tokens": 6429074832.0, + "step": 12575 + }, + { + "epoch": 3.400757166035695, + "grad_norm": 2.875, + "learning_rate": 0.006413810970001115, + "loss": 2.808, + "mean_token_accuracy": 0.43479472398757935, + "num_tokens": 6429537031.0, + "step": 12576 + }, + { + "epoch": 3.4010275824770146, + "grad_norm": 4.28125, + "learning_rate": 0.006412454589139304, + "loss": 2.6824, + "mean_token_accuracy": 0.4690241813659668, + "num_tokens": 6430061256.0, + "step": 12577 + }, + { + "epoch": 3.4012979989183343, + "grad_norm": 2.734375, + "learning_rate": 0.006411098349036877, + "loss": 2.9609, + "mean_token_accuracy": 0.4267539083957672, + "num_tokens": 6430585480.0, + "step": 12578 + }, + { + "epoch": 3.401568415359654, + "grad_norm": 4.25, + "learning_rate": 0.006409742249735451, + "loss": 2.934, + "mean_token_accuracy": 0.40003901720046997, + "num_tokens": 6431109724.0, + "step": 12579 + }, + { + "epoch": 3.4018388318009736, + "grad_norm": 2.578125, + "learning_rate": 0.0064083862912766324, + "loss": 2.9426, + "mean_token_accuracy": 0.43475162982940674, + "num_tokens": 6431607209.0, + "step": 12580 + }, + { + "epoch": 3.402109248242293, + "grad_norm": 6.59375, + "learning_rate": 0.006407030473702021, + "loss": 2.7575, + "mean_token_accuracy": 0.4594559371471405, + "num_tokens": 6432131475.0, + "step": 12581 + }, + { + "epoch": 3.402379664683613, + "grad_norm": 1.6015625, + "learning_rate": 0.006405674797053227, + "loss": 3.1265, + "mean_token_accuracy": 0.4158482551574707, + "num_tokens": 6432655695.0, + "step": 12582 + }, + { + "epoch": 3.4026500811249325, + "grad_norm": 2.59375, + "learning_rate": 0.006404319261371839, + "loss": 2.7838, + "mean_token_accuracy": 0.4288201630115509, + "num_tokens": 6433179904.0, + "step": 12583 + }, + { + "epoch": 3.402920497566252, + "grad_norm": 2.46875, + "learning_rate": 0.006402963866699449, + "loss": 3.0862, + "mean_token_accuracy": 0.4190153479576111, + "num_tokens": 6433704049.0, + "step": 12584 + }, + { + "epoch": 3.403190914007572, + "grad_norm": 3.21875, + "learning_rate": 0.00640160861307765, + "loss": 3.041, + "mean_token_accuracy": 0.4031175971031189, + "num_tokens": 6434228331.0, + "step": 12585 + }, + { + "epoch": 3.4034613304488914, + "grad_norm": 2.5625, + "learning_rate": 0.006400253500548018, + "loss": 2.8281, + "mean_token_accuracy": 0.42446160316467285, + "num_tokens": 6434752513.0, + "step": 12586 + }, + { + "epoch": 3.403731746890211, + "grad_norm": 3.421875, + "learning_rate": 0.00639889852915214, + "loss": 2.9757, + "mean_token_accuracy": 0.42304056882858276, + "num_tokens": 6435276719.0, + "step": 12587 + }, + { + "epoch": 3.4040021633315307, + "grad_norm": 3.140625, + "learning_rate": 0.006397543698931585, + "loss": 2.9047, + "mean_token_accuracy": 0.44535958766937256, + "num_tokens": 6435798909.0, + "step": 12588 + }, + { + "epoch": 3.4042725797728504, + "grad_norm": 2.921875, + "learning_rate": 0.00639618900992792, + "loss": 2.9992, + "mean_token_accuracy": 0.4222469925880432, + "num_tokens": 6436323114.0, + "step": 12589 + }, + { + "epoch": 3.40454299621417, + "grad_norm": 3.3125, + "learning_rate": 0.00639483446218272, + "loss": 3.0352, + "mean_token_accuracy": 0.4036567211151123, + "num_tokens": 6436841900.0, + "step": 12590 + }, + { + "epoch": 3.4048134126554896, + "grad_norm": 18.625, + "learning_rate": 0.006393480055737537, + "loss": 10.986, + "mean_token_accuracy": 0.00037160335341468453, + "num_tokens": 6437335549.0, + "step": 12591 + }, + { + "epoch": 3.4050838290968093, + "grad_norm": 6.09375, + "learning_rate": 0.006392125790633938, + "loss": 3.0904, + "mean_token_accuracy": 0.4002453088760376, + "num_tokens": 6437859814.0, + "step": 12592 + }, + { + "epoch": 3.4053542455381285, + "grad_norm": 2.5625, + "learning_rate": 0.006390771666913465, + "loss": 3.0937, + "mean_token_accuracy": 0.41340190172195435, + "num_tokens": 6438359355.0, + "step": 12593 + }, + { + "epoch": 3.4056246619794486, + "grad_norm": 2.46875, + "learning_rate": 0.006389417684617677, + "loss": 2.9668, + "mean_token_accuracy": 0.4337637424468994, + "num_tokens": 6438824148.0, + "step": 12594 + }, + { + "epoch": 3.4058950784207678, + "grad_norm": 3.96875, + "learning_rate": 0.00638806384378811, + "loss": 3.1334, + "mean_token_accuracy": 0.42183420062065125, + "num_tokens": 6439315191.0, + "step": 12595 + }, + { + "epoch": 3.406165494862088, + "grad_norm": 2.984375, + "learning_rate": 0.00638671014446631, + "loss": 2.8794, + "mean_token_accuracy": 0.4153827428817749, + "num_tokens": 6439839455.0, + "step": 12596 + }, + { + "epoch": 3.406435911303407, + "grad_norm": 2.828125, + "learning_rate": 0.006385356586693809, + "loss": 2.8792, + "mean_token_accuracy": 0.43852925300598145, + "num_tokens": 6440354297.0, + "step": 12597 + }, + { + "epoch": 3.4067063277447267, + "grad_norm": 2.953125, + "learning_rate": 0.006384003170512136, + "loss": 3.0446, + "mean_token_accuracy": 0.4017540216445923, + "num_tokens": 6440850224.0, + "step": 12598 + }, + { + "epoch": 3.4069767441860463, + "grad_norm": 2.34375, + "learning_rate": 0.0063826498959628246, + "loss": 2.8744, + "mean_token_accuracy": 0.4286876916885376, + "num_tokens": 6441374485.0, + "step": 12599 + }, + { + "epoch": 3.407247160627366, + "grad_norm": 2.671875, + "learning_rate": 0.006381296763087392, + "loss": 2.838, + "mean_token_accuracy": 0.42169642448425293, + "num_tokens": 6441898657.0, + "step": 12600 + }, + { + "epoch": 3.4075175770686856, + "grad_norm": 2.578125, + "learning_rate": 0.0063799437719273535, + "loss": 2.8538, + "mean_token_accuracy": 0.43126538395881653, + "num_tokens": 6442422684.0, + "step": 12601 + }, + { + "epoch": 3.4077879935100053, + "grad_norm": 2.609375, + "learning_rate": 0.0063785909225242314, + "loss": 2.895, + "mean_token_accuracy": 0.4572368860244751, + "num_tokens": 6442946890.0, + "step": 12602 + }, + { + "epoch": 3.408058409951325, + "grad_norm": 3.5625, + "learning_rate": 0.00637723821491953, + "loss": 3.1543, + "mean_token_accuracy": 0.40966796875, + "num_tokens": 6443442573.0, + "step": 12603 + }, + { + "epoch": 3.4083288263926446, + "grad_norm": 4.25, + "learning_rate": 0.006375885649154751, + "loss": 2.8968, + "mean_token_accuracy": 0.42371612787246704, + "num_tokens": 6443966767.0, + "step": 12604 + }, + { + "epoch": 3.408599242833964, + "grad_norm": 3.5, + "learning_rate": 0.0063745332252713996, + "loss": 2.6977, + "mean_token_accuracy": 0.4278118908405304, + "num_tokens": 6444465192.0, + "step": 12605 + }, + { + "epoch": 3.408869659275284, + "grad_norm": 2.53125, + "learning_rate": 0.0063731809433109745, + "loss": 2.7774, + "mean_token_accuracy": 0.44462448358535767, + "num_tokens": 6444981862.0, + "step": 12606 + }, + { + "epoch": 3.4091400757166035, + "grad_norm": 2.9375, + "learning_rate": 0.006371828803314962, + "loss": 2.9406, + "mean_token_accuracy": 0.4230233430862427, + "num_tokens": 6445506014.0, + "step": 12607 + }, + { + "epoch": 3.409410492157923, + "grad_norm": 2.578125, + "learning_rate": 0.0063704768053248545, + "loss": 2.944, + "mean_token_accuracy": 0.4359935522079468, + "num_tokens": 6445939896.0, + "step": 12608 + }, + { + "epoch": 3.4096809085992428, + "grad_norm": 3.234375, + "learning_rate": 0.0063691249493821305, + "loss": 3.0588, + "mean_token_accuracy": 0.42076611518859863, + "num_tokens": 6446464037.0, + "step": 12609 + }, + { + "epoch": 3.4099513250405624, + "grad_norm": 2.6875, + "learning_rate": 0.006367773235528276, + "loss": 3.0821, + "mean_token_accuracy": 0.4413776397705078, + "num_tokens": 6446924050.0, + "step": 12610 + }, + { + "epoch": 3.410221741481882, + "grad_norm": 84.5, + "learning_rate": 0.00636642166380476, + "loss": 12.0246, + "mean_token_accuracy": 0.011222358793020248, + "num_tokens": 6447448330.0, + "step": 12611 + }, + { + "epoch": 3.4104921579232017, + "grad_norm": 6.875, + "learning_rate": 0.00636507023425305, + "loss": 3.0101, + "mean_token_accuracy": 0.40399542450904846, + "num_tokens": 6447972503.0, + "step": 12612 + }, + { + "epoch": 3.4107625743645213, + "grad_norm": 2.171875, + "learning_rate": 0.006363718946914619, + "loss": 2.8883, + "mean_token_accuracy": 0.4048560559749603, + "num_tokens": 6448496680.0, + "step": 12613 + }, + { + "epoch": 3.411032990805841, + "grad_norm": 2.375, + "learning_rate": 0.006362367801830926, + "loss": 2.9653, + "mean_token_accuracy": 0.3983455300331116, + "num_tokens": 6449020872.0, + "step": 12614 + }, + { + "epoch": 3.4113034072471606, + "grad_norm": 3.5, + "learning_rate": 0.006361016799043423, + "loss": 2.8642, + "mean_token_accuracy": 0.42968857288360596, + "num_tokens": 6449488598.0, + "step": 12615 + }, + { + "epoch": 3.4115738236884803, + "grad_norm": 3.21875, + "learning_rate": 0.0063596659385935705, + "loss": 2.8745, + "mean_token_accuracy": 0.43850177526474, + "num_tokens": 6449948603.0, + "step": 12616 + }, + { + "epoch": 3.4118442401298, + "grad_norm": 2.78125, + "learning_rate": 0.006358315220522814, + "loss": 2.8958, + "mean_token_accuracy": 0.4191286265850067, + "num_tokens": 6450472883.0, + "step": 12617 + }, + { + "epoch": 3.4121146565711196, + "grad_norm": 3.09375, + "learning_rate": 0.006356964644872593, + "loss": 3.0472, + "mean_token_accuracy": 0.4167897701263428, + "num_tokens": 6450997107.0, + "step": 12618 + }, + { + "epoch": 3.412385073012439, + "grad_norm": 3.09375, + "learning_rate": 0.006355614211684355, + "loss": 2.9911, + "mean_token_accuracy": 0.4059852957725525, + "num_tokens": 6451521347.0, + "step": 12619 + }, + { + "epoch": 3.412655489453759, + "grad_norm": 21.375, + "learning_rate": 0.006354263920999526, + "loss": 3.3964, + "mean_token_accuracy": 0.37810102105140686, + "num_tokens": 6452045596.0, + "step": 12620 + }, + { + "epoch": 3.4129259058950785, + "grad_norm": 4.1875, + "learning_rate": 0.006352913772859545, + "loss": 2.8839, + "mean_token_accuracy": 0.43583929538726807, + "num_tokens": 6452463396.0, + "step": 12621 + }, + { + "epoch": 3.413196322336398, + "grad_norm": 2.078125, + "learning_rate": 0.006351563767305837, + "loss": 3.1143, + "mean_token_accuracy": 0.4112092852592468, + "num_tokens": 6452971588.0, + "step": 12622 + }, + { + "epoch": 3.413466738777718, + "grad_norm": 2.59375, + "learning_rate": 0.006350213904379823, + "loss": 2.7707, + "mean_token_accuracy": 0.4462761878967285, + "num_tokens": 6453495833.0, + "step": 12623 + }, + { + "epoch": 3.4137371552190374, + "grad_norm": 2.828125, + "learning_rate": 0.0063488641841229175, + "loss": 3.0779, + "mean_token_accuracy": 0.4093206524848938, + "num_tokens": 6454020008.0, + "step": 12624 + }, + { + "epoch": 3.414007571660357, + "grad_norm": 3.0, + "learning_rate": 0.006347514606576541, + "loss": 3.0607, + "mean_token_accuracy": 0.4073031544685364, + "num_tokens": 6454544273.0, + "step": 12625 + }, + { + "epoch": 3.4142779881016767, + "grad_norm": 3.21875, + "learning_rate": 0.006346165171782096, + "loss": 2.6735, + "mean_token_accuracy": 0.4450805187225342, + "num_tokens": 6455040025.0, + "step": 12626 + }, + { + "epoch": 3.4145484045429964, + "grad_norm": 2.859375, + "learning_rate": 0.0063448158797809896, + "loss": 2.9264, + "mean_token_accuracy": 0.44049978256225586, + "num_tokens": 6455502492.0, + "step": 12627 + }, + { + "epoch": 3.414818820984316, + "grad_norm": 18.625, + "learning_rate": 0.0063434667306146225, + "loss": 2.8718, + "mean_token_accuracy": 0.45119258761405945, + "num_tokens": 6456026695.0, + "step": 12628 + }, + { + "epoch": 3.4150892374256356, + "grad_norm": 3.765625, + "learning_rate": 0.006342117724324391, + "loss": 3.0249, + "mean_token_accuracy": 0.42057210206985474, + "num_tokens": 6456541944.0, + "step": 12629 + }, + { + "epoch": 3.4153596538669553, + "grad_norm": 2.78125, + "learning_rate": 0.006340768860951684, + "loss": 3.0026, + "mean_token_accuracy": 0.4196881055831909, + "num_tokens": 6457066170.0, + "step": 12630 + }, + { + "epoch": 3.415630070308275, + "grad_norm": 14.375, + "learning_rate": 0.006339420140537892, + "loss": 10.9953, + "mean_token_accuracy": 0.005629941821098328, + "num_tokens": 6457590409.0, + "step": 12631 + }, + { + "epoch": 3.4159004867495946, + "grad_norm": 5.84375, + "learning_rate": 0.006338071563124392, + "loss": 3.1033, + "mean_token_accuracy": 0.41262245178222656, + "num_tokens": 6458056783.0, + "step": 12632 + }, + { + "epoch": 3.416170903190914, + "grad_norm": 2.3125, + "learning_rate": 0.006336723128752571, + "loss": 2.8851, + "mean_token_accuracy": 0.3879154920578003, + "num_tokens": 6458580888.0, + "step": 12633 + }, + { + "epoch": 3.4164413196322334, + "grad_norm": 2.390625, + "learning_rate": 0.006335374837463794, + "loss": 2.8037, + "mean_token_accuracy": 0.4157370626926422, + "num_tokens": 6459060025.0, + "step": 12634 + }, + { + "epoch": 3.4167117360735535, + "grad_norm": 3.125, + "learning_rate": 0.006334026689299435, + "loss": 2.8666, + "mean_token_accuracy": 0.42376846075057983, + "num_tokens": 6459526917.0, + "step": 12635 + }, + { + "epoch": 3.4169821525148727, + "grad_norm": 3.703125, + "learning_rate": 0.006332678684300861, + "loss": 3.1934, + "mean_token_accuracy": 0.39189785718917847, + "num_tokens": 6460051136.0, + "step": 12636 + }, + { + "epoch": 3.417252568956193, + "grad_norm": 2.703125, + "learning_rate": 0.006331330822509433, + "loss": 2.9639, + "mean_token_accuracy": 0.402337908744812, + "num_tokens": 6460575256.0, + "step": 12637 + }, + { + "epoch": 3.417522985397512, + "grad_norm": 3.203125, + "learning_rate": 0.006329983103966501, + "loss": 2.9424, + "mean_token_accuracy": 0.4097173511981964, + "num_tokens": 6461099529.0, + "step": 12638 + }, + { + "epoch": 3.4177934018388316, + "grad_norm": 2.25, + "learning_rate": 0.006328635528713422, + "loss": 2.9332, + "mean_token_accuracy": 0.4287260174751282, + "num_tokens": 6461574696.0, + "step": 12639 + }, + { + "epoch": 3.4180638182801513, + "grad_norm": 3.328125, + "learning_rate": 0.006327288096791546, + "loss": 2.8356, + "mean_token_accuracy": 0.4326517581939697, + "num_tokens": 6462075184.0, + "step": 12640 + }, + { + "epoch": 3.418334234721471, + "grad_norm": 2.5625, + "learning_rate": 0.006325940808242208, + "loss": 2.9305, + "mean_token_accuracy": 0.42617151141166687, + "num_tokens": 6462599461.0, + "step": 12641 + }, + { + "epoch": 3.4186046511627906, + "grad_norm": 3.3125, + "learning_rate": 0.006324593663106754, + "loss": 3.0102, + "mean_token_accuracy": 0.44896745681762695, + "num_tokens": 6463058185.0, + "step": 12642 + }, + { + "epoch": 3.41887506760411, + "grad_norm": 2.8125, + "learning_rate": 0.006323246661426516, + "loss": 2.6023, + "mean_token_accuracy": 0.45393437147140503, + "num_tokens": 6463523366.0, + "step": 12643 + }, + { + "epoch": 3.41914548404543, + "grad_norm": 2.28125, + "learning_rate": 0.006321899803242822, + "loss": 3.0444, + "mean_token_accuracy": 0.38004136085510254, + "num_tokens": 6464047545.0, + "step": 12644 + }, + { + "epoch": 3.4194159004867495, + "grad_norm": 4.71875, + "learning_rate": 0.006320553088597002, + "loss": 2.9656, + "mean_token_accuracy": 0.4286941885948181, + "num_tokens": 6464534266.0, + "step": 12645 + }, + { + "epoch": 3.419686316928069, + "grad_norm": 2.1875, + "learning_rate": 0.0063192065175303735, + "loss": 3.0706, + "mean_token_accuracy": 0.4118771553039551, + "num_tokens": 6465058502.0, + "step": 12646 + }, + { + "epoch": 3.4199567333693888, + "grad_norm": 3.0625, + "learning_rate": 0.006317860090084253, + "loss": 2.796, + "mean_token_accuracy": 0.4326757788658142, + "num_tokens": 6465582612.0, + "step": 12647 + }, + { + "epoch": 3.4202271498107084, + "grad_norm": 2.421875, + "learning_rate": 0.006316513806299953, + "loss": 3.0199, + "mean_token_accuracy": 0.4023563861846924, + "num_tokens": 6466106850.0, + "step": 12648 + }, + { + "epoch": 3.420497566252028, + "grad_norm": 3.109375, + "learning_rate": 0.006315167666218786, + "loss": 2.9468, + "mean_token_accuracy": 0.43861937522888184, + "num_tokens": 6466591402.0, + "step": 12649 + }, + { + "epoch": 3.4207679826933477, + "grad_norm": 2.375, + "learning_rate": 0.00631382166988205, + "loss": 2.8363, + "mean_token_accuracy": 0.4310252070426941, + "num_tokens": 6467115605.0, + "step": 12650 + }, + { + "epoch": 3.4210383991346673, + "grad_norm": 92.5, + "learning_rate": 0.006312475817331049, + "loss": 10.995, + "mean_token_accuracy": 0.035257820039987564, + "num_tokens": 6467639649.0, + "step": 12651 + }, + { + "epoch": 3.421308815575987, + "grad_norm": 5.15625, + "learning_rate": 0.006311130108607075, + "loss": 2.9953, + "mean_token_accuracy": 0.40900468826293945, + "num_tokens": 6468163906.0, + "step": 12652 + }, + { + "epoch": 3.4215792320173066, + "grad_norm": 2.53125, + "learning_rate": 0.006309784543751414, + "loss": 3.0734, + "mean_token_accuracy": 0.4210568964481354, + "num_tokens": 6468681854.0, + "step": 12653 + }, + { + "epoch": 3.4218496484586263, + "grad_norm": 4.6875, + "learning_rate": 0.00630843912280536, + "loss": 2.9594, + "mean_token_accuracy": 0.4354381561279297, + "num_tokens": 6469145865.0, + "step": 12654 + }, + { + "epoch": 3.422120064899946, + "grad_norm": 3.5, + "learning_rate": 0.00630709384581019, + "loss": 2.915, + "mean_token_accuracy": 0.4297768771648407, + "num_tokens": 6469670064.0, + "step": 12655 + }, + { + "epoch": 3.4223904813412656, + "grad_norm": 3.140625, + "learning_rate": 0.00630574871280718, + "loss": 2.9158, + "mean_token_accuracy": 0.421499639749527, + "num_tokens": 6470194245.0, + "step": 12656 + }, + { + "epoch": 3.422660897782585, + "grad_norm": 3.359375, + "learning_rate": 0.006304403723837608, + "loss": 2.8995, + "mean_token_accuracy": 0.40248849987983704, + "num_tokens": 6470718429.0, + "step": 12657 + }, + { + "epoch": 3.422931314223905, + "grad_norm": 2.75, + "learning_rate": 0.006303058878942733, + "loss": 3.0081, + "mean_token_accuracy": 0.41515251994132996, + "num_tokens": 6471242642.0, + "step": 12658 + }, + { + "epoch": 3.4232017306652245, + "grad_norm": 2.828125, + "learning_rate": 0.006301714178163831, + "loss": 2.9087, + "mean_token_accuracy": 0.4261350631713867, + "num_tokens": 6471766655.0, + "step": 12659 + }, + { + "epoch": 3.423472147106544, + "grad_norm": 2.640625, + "learning_rate": 0.006300369621542152, + "loss": 2.8795, + "mean_token_accuracy": 0.42400646209716797, + "num_tokens": 6472289337.0, + "step": 12660 + }, + { + "epoch": 3.4237425635478638, + "grad_norm": 3.296875, + "learning_rate": 0.006299025209118951, + "loss": 2.9015, + "mean_token_accuracy": 0.39317744970321655, + "num_tokens": 6472813479.0, + "step": 12661 + }, + { + "epoch": 3.4240129799891834, + "grad_norm": 2.25, + "learning_rate": 0.006297680940935482, + "loss": 2.9805, + "mean_token_accuracy": 0.4398694634437561, + "num_tokens": 6473287755.0, + "step": 12662 + }, + { + "epoch": 3.424283396430503, + "grad_norm": 2.828125, + "learning_rate": 0.006296336817032995, + "loss": 3.0598, + "mean_token_accuracy": 0.42249375581741333, + "num_tokens": 6473810797.0, + "step": 12663 + }, + { + "epoch": 3.4245538128718227, + "grad_norm": 3.21875, + "learning_rate": 0.006294992837452721, + "loss": 2.9369, + "mean_token_accuracy": 0.4139571487903595, + "num_tokens": 6474334847.0, + "step": 12664 + }, + { + "epoch": 3.4248242293131423, + "grad_norm": 3.359375, + "learning_rate": 0.006293649002235907, + "loss": 2.8542, + "mean_token_accuracy": 0.4221915006637573, + "num_tokens": 6474859077.0, + "step": 12665 + }, + { + "epoch": 3.425094645754462, + "grad_norm": 3.359375, + "learning_rate": 0.006292305311423782, + "loss": 2.9005, + "mean_token_accuracy": 0.4270690083503723, + "num_tokens": 6475361403.0, + "step": 12666 + }, + { + "epoch": 3.4253650621957816, + "grad_norm": 2.890625, + "learning_rate": 0.006290961765057572, + "loss": 3.0347, + "mean_token_accuracy": 0.4407932460308075, + "num_tokens": 6475820474.0, + "step": 12667 + }, + { + "epoch": 3.4256354786371013, + "grad_norm": 3.703125, + "learning_rate": 0.0062896183631785075, + "loss": 3.1156, + "mean_token_accuracy": 0.4059416651725769, + "num_tokens": 6476344628.0, + "step": 12668 + }, + { + "epoch": 3.425905895078421, + "grad_norm": 3.109375, + "learning_rate": 0.006288275105827803, + "loss": 2.8828, + "mean_token_accuracy": 0.4295978844165802, + "num_tokens": 6476868731.0, + "step": 12669 + }, + { + "epoch": 3.4261763115197406, + "grad_norm": 3.21875, + "learning_rate": 0.006286931993046672, + "loss": 2.9976, + "mean_token_accuracy": 0.4284667372703552, + "num_tokens": 6477382529.0, + "step": 12670 + }, + { + "epoch": 3.42644672796106, + "grad_norm": 516.0, + "learning_rate": 0.006285589024876328, + "loss": 15.9986, + "mean_token_accuracy": 2.9198785341577604e-05, + "num_tokens": 6477906640.0, + "step": 12671 + }, + { + "epoch": 3.42671714440238, + "grad_norm": 8.1875, + "learning_rate": 0.006284246201357979, + "loss": 3.0908, + "mean_token_accuracy": 0.3974427282810211, + "num_tokens": 6478430917.0, + "step": 12672 + }, + { + "epoch": 3.4269875608436995, + "grad_norm": 3.40625, + "learning_rate": 0.006282903522532822, + "loss": 2.9923, + "mean_token_accuracy": 0.43401890993118286, + "num_tokens": 6478955107.0, + "step": 12673 + }, + { + "epoch": 3.427257977285019, + "grad_norm": 2.625, + "learning_rate": 0.006281560988442061, + "loss": 2.972, + "mean_token_accuracy": 0.4247190058231354, + "num_tokens": 6479479366.0, + "step": 12674 + }, + { + "epoch": 3.4275283937263383, + "grad_norm": 2.625, + "learning_rate": 0.00628021859912688, + "loss": 3.0103, + "mean_token_accuracy": 0.3941999077796936, + "num_tokens": 6480003516.0, + "step": 12675 + }, + { + "epoch": 3.4277988101676584, + "grad_norm": 3.6875, + "learning_rate": 0.006278876354628475, + "loss": 3.0284, + "mean_token_accuracy": 0.40974175930023193, + "num_tokens": 6480527665.0, + "step": 12676 + }, + { + "epoch": 3.4280692266089776, + "grad_norm": 2.28125, + "learning_rate": 0.006277534254988025, + "loss": 2.932, + "mean_token_accuracy": 0.4272904694080353, + "num_tokens": 6481017528.0, + "step": 12677 + }, + { + "epoch": 3.4283396430502977, + "grad_norm": 2.859375, + "learning_rate": 0.006276192300246716, + "loss": 2.7512, + "mean_token_accuracy": 0.4447450041770935, + "num_tokens": 6481541787.0, + "step": 12678 + }, + { + "epoch": 3.428610059491617, + "grad_norm": 3.34375, + "learning_rate": 0.006274850490445716, + "loss": 2.8852, + "mean_token_accuracy": 0.423042893409729, + "num_tokens": 6482065878.0, + "step": 12679 + }, + { + "epoch": 3.4288804759329365, + "grad_norm": 2.84375, + "learning_rate": 0.0062735088256262, + "loss": 2.7164, + "mean_token_accuracy": 0.4523962140083313, + "num_tokens": 6482533812.0, + "step": 12680 + }, + { + "epoch": 3.429150892374256, + "grad_norm": 2.625, + "learning_rate": 0.00627216730582933, + "loss": 3.0395, + "mean_token_accuracy": 0.42133277654647827, + "num_tokens": 6483057509.0, + "step": 12681 + }, + { + "epoch": 3.429421308815576, + "grad_norm": 4.09375, + "learning_rate": 0.0062708259310962754, + "loss": 2.9287, + "mean_token_accuracy": 0.4147017002105713, + "num_tokens": 6483581606.0, + "step": 12682 + }, + { + "epoch": 3.4296917252568955, + "grad_norm": 3.78125, + "learning_rate": 0.006269484701468189, + "loss": 3.2358, + "mean_token_accuracy": 0.3972983658313751, + "num_tokens": 6484105858.0, + "step": 12683 + }, + { + "epoch": 3.429962141698215, + "grad_norm": 3.625, + "learning_rate": 0.006268143616986219, + "loss": 3.0649, + "mean_token_accuracy": 0.41092148423194885, + "num_tokens": 6484629993.0, + "step": 12684 + }, + { + "epoch": 3.4302325581395348, + "grad_norm": 2.921875, + "learning_rate": 0.006266802677691522, + "loss": 2.9367, + "mean_token_accuracy": 0.44043439626693726, + "num_tokens": 6485154234.0, + "step": 12685 + }, + { + "epoch": 3.4305029745808544, + "grad_norm": 2.8125, + "learning_rate": 0.006265461883625239, + "loss": 2.9647, + "mean_token_accuracy": 0.42397475242614746, + "num_tokens": 6485637073.0, + "step": 12686 + }, + { + "epoch": 3.430773391022174, + "grad_norm": 2.5625, + "learning_rate": 0.006264121234828504, + "loss": 3.0148, + "mean_token_accuracy": 0.4480801820755005, + "num_tokens": 6486088018.0, + "step": 12687 + }, + { + "epoch": 3.4310438074634937, + "grad_norm": 3.15625, + "learning_rate": 0.006262780731342461, + "loss": 3.0105, + "mean_token_accuracy": 0.42267587780952454, + "num_tokens": 6486587351.0, + "step": 12688 + }, + { + "epoch": 3.4313142239048133, + "grad_norm": 3.046875, + "learning_rate": 0.006261440373208236, + "loss": 2.6774, + "mean_token_accuracy": 0.45886707305908203, + "num_tokens": 6487111614.0, + "step": 12689 + }, + { + "epoch": 3.431584640346133, + "grad_norm": 3.15625, + "learning_rate": 0.0062601001604669524, + "loss": 2.9325, + "mean_token_accuracy": 0.45113542675971985, + "num_tokens": 6487545491.0, + "step": 12690 + }, + { + "epoch": 3.4318550567874526, + "grad_norm": 12.25, + "learning_rate": 0.0062587600931597355, + "loss": 9.4961, + "mean_token_accuracy": 0.012317905202507973, + "num_tokens": 6488069587.0, + "step": 12691 + }, + { + "epoch": 3.4321254732287723, + "grad_norm": 7.125, + "learning_rate": 0.006257420171327706, + "loss": 3.0808, + "mean_token_accuracy": 0.39795950055122375, + "num_tokens": 6488593764.0, + "step": 12692 + }, + { + "epoch": 3.432395889670092, + "grad_norm": 2.234375, + "learning_rate": 0.006256080395011967, + "loss": 2.8124, + "mean_token_accuracy": 0.4346334934234619, + "num_tokens": 6489118022.0, + "step": 12693 + }, + { + "epoch": 3.4326663061114115, + "grad_norm": 2.984375, + "learning_rate": 0.006254740764253637, + "loss": 3.061, + "mean_token_accuracy": 0.40153488516807556, + "num_tokens": 6489642124.0, + "step": 12694 + }, + { + "epoch": 3.432936722552731, + "grad_norm": 3.296875, + "learning_rate": 0.006253401279093815, + "loss": 3.0368, + "mean_token_accuracy": 0.42042988538742065, + "num_tokens": 6490166404.0, + "step": 12695 + }, + { + "epoch": 3.433207138994051, + "grad_norm": 3.484375, + "learning_rate": 0.006252061939573597, + "loss": 3.084, + "mean_token_accuracy": 0.4298901855945587, + "num_tokens": 6490643029.0, + "step": 12696 + }, + { + "epoch": 3.4334775554353705, + "grad_norm": 3.046875, + "learning_rate": 0.006250722745734086, + "loss": 2.9441, + "mean_token_accuracy": 0.431293785572052, + "num_tokens": 6491123643.0, + "step": 12697 + }, + { + "epoch": 3.43374797187669, + "grad_norm": 3.125, + "learning_rate": 0.006249383697616366, + "loss": 3.0157, + "mean_token_accuracy": 0.4069260358810425, + "num_tokens": 6491647914.0, + "step": 12698 + }, + { + "epoch": 3.4340183883180098, + "grad_norm": 3.015625, + "learning_rate": 0.0062480447952615245, + "loss": 2.8895, + "mean_token_accuracy": 0.41207385063171387, + "num_tokens": 6492172187.0, + "step": 12699 + }, + { + "epoch": 3.4342888047593294, + "grad_norm": 2.9375, + "learning_rate": 0.0062467060387106435, + "loss": 3.012, + "mean_token_accuracy": 0.4011572599411011, + "num_tokens": 6492696471.0, + "step": 12700 + }, + { + "epoch": 3.434559221200649, + "grad_norm": 3.328125, + "learning_rate": 0.0062453674280048025, + "loss": 3.0547, + "mean_token_accuracy": 0.42836999893188477, + "num_tokens": 6493156347.0, + "step": 12701 + }, + { + "epoch": 3.4348296376419687, + "grad_norm": 3.4375, + "learning_rate": 0.006244028963185067, + "loss": 2.9817, + "mean_token_accuracy": 0.39170166850090027, + "num_tokens": 6493680610.0, + "step": 12702 + }, + { + "epoch": 3.4351000540832883, + "grad_norm": 3.453125, + "learning_rate": 0.006242690644292512, + "loss": 2.9242, + "mean_token_accuracy": 0.42838484048843384, + "num_tokens": 6494187843.0, + "step": 12703 + }, + { + "epoch": 3.435370470524608, + "grad_norm": 3.109375, + "learning_rate": 0.006241352471368195, + "loss": 2.9544, + "mean_token_accuracy": 0.418740451335907, + "num_tokens": 6494712069.0, + "step": 12704 + }, + { + "epoch": 3.4356408869659276, + "grad_norm": 2.859375, + "learning_rate": 0.006240014444453178, + "loss": 2.8382, + "mean_token_accuracy": 0.4411354959011078, + "num_tokens": 6495236177.0, + "step": 12705 + }, + { + "epoch": 3.4359113034072473, + "grad_norm": 2.953125, + "learning_rate": 0.006238676563588519, + "loss": 2.854, + "mean_token_accuracy": 0.42715179920196533, + "num_tokens": 6495760211.0, + "step": 12706 + }, + { + "epoch": 3.436181719848567, + "grad_norm": 3.28125, + "learning_rate": 0.006237338828815264, + "loss": 2.7536, + "mean_token_accuracy": 0.432023286819458, + "num_tokens": 6496284456.0, + "step": 12707 + }, + { + "epoch": 3.4364521362898865, + "grad_norm": 2.875, + "learning_rate": 0.006236001240174462, + "loss": 2.7954, + "mean_token_accuracy": 0.42254638671875, + "num_tokens": 6496808714.0, + "step": 12708 + }, + { + "epoch": 3.436722552731206, + "grad_norm": 2.890625, + "learning_rate": 0.006234663797707152, + "loss": 3.0159, + "mean_token_accuracy": 0.42615610361099243, + "num_tokens": 6497332984.0, + "step": 12709 + }, + { + "epoch": 3.436992969172526, + "grad_norm": 2.96875, + "learning_rate": 0.006233326501454366, + "loss": 2.9912, + "mean_token_accuracy": 0.3955504596233368, + "num_tokens": 6497857234.0, + "step": 12710 + }, + { + "epoch": 3.4372633856138455, + "grad_norm": 15.6875, + "learning_rate": 0.006231989351457147, + "loss": 12.2196, + "mean_token_accuracy": 0.013014226220548153, + "num_tokens": 6498381490.0, + "step": 12711 + }, + { + "epoch": 3.437533802055165, + "grad_norm": 5.96875, + "learning_rate": 0.006230652347756514, + "loss": 2.9887, + "mean_token_accuracy": 0.40610140562057495, + "num_tokens": 6498905631.0, + "step": 12712 + }, + { + "epoch": 3.4378042184964848, + "grad_norm": 2.015625, + "learning_rate": 0.00622931549039349, + "loss": 2.5894, + "mean_token_accuracy": 0.462157279253006, + "num_tokens": 6499330516.0, + "step": 12713 + }, + { + "epoch": 3.4380746349378044, + "grad_norm": 3.046875, + "learning_rate": 0.0062279787794091, + "loss": 2.9336, + "mean_token_accuracy": 0.41402941942214966, + "num_tokens": 6499854719.0, + "step": 12714 + }, + { + "epoch": 3.438345051379124, + "grad_norm": 3.203125, + "learning_rate": 0.006226642214844356, + "loss": 3.0959, + "mean_token_accuracy": 0.39667809009552, + "num_tokens": 6500378897.0, + "step": 12715 + }, + { + "epoch": 3.4386154678204432, + "grad_norm": 2.65625, + "learning_rate": 0.006225305796740262, + "loss": 3.0007, + "mean_token_accuracy": 0.4096633195877075, + "num_tokens": 6500903055.0, + "step": 12716 + }, + { + "epoch": 3.4388858842617633, + "grad_norm": 3.046875, + "learning_rate": 0.006223969525137829, + "loss": 2.6618, + "mean_token_accuracy": 0.4385690689086914, + "num_tokens": 6501427321.0, + "step": 12717 + }, + { + "epoch": 3.4391563007030825, + "grad_norm": 2.921875, + "learning_rate": 0.006222633400078056, + "loss": 2.9172, + "mean_token_accuracy": 0.42452841997146606, + "num_tokens": 6501933008.0, + "step": 12718 + }, + { + "epoch": 3.4394267171444026, + "grad_norm": 3.046875, + "learning_rate": 0.0062212974216019415, + "loss": 3.1777, + "mean_token_accuracy": 0.40236836671829224, + "num_tokens": 6502375292.0, + "step": 12719 + }, + { + "epoch": 3.439697133585722, + "grad_norm": 3.296875, + "learning_rate": 0.006219961589750471, + "loss": 2.7121, + "mean_token_accuracy": 0.4366459250450134, + "num_tokens": 6502878536.0, + "step": 12720 + }, + { + "epoch": 3.4399675500270415, + "grad_norm": 2.796875, + "learning_rate": 0.006218625904564641, + "loss": 2.815, + "mean_token_accuracy": 0.4290899336338043, + "num_tokens": 6503402752.0, + "step": 12721 + }, + { + "epoch": 3.440237966468361, + "grad_norm": 2.640625, + "learning_rate": 0.006217290366085426, + "loss": 2.9497, + "mean_token_accuracy": 0.4328230023384094, + "num_tokens": 6503926972.0, + "step": 12722 + }, + { + "epoch": 3.4405083829096808, + "grad_norm": 2.875, + "learning_rate": 0.00621595497435381, + "loss": 3.0084, + "mean_token_accuracy": 0.42525872588157654, + "num_tokens": 6504451114.0, + "step": 12723 + }, + { + "epoch": 3.4407787993510004, + "grad_norm": 3.78125, + "learning_rate": 0.0062146197294107655, + "loss": 3.0098, + "mean_token_accuracy": 0.41709214448928833, + "num_tokens": 6504975313.0, + "step": 12724 + }, + { + "epoch": 3.44104921579232, + "grad_norm": 2.609375, + "learning_rate": 0.006213284631297256, + "loss": 2.7906, + "mean_token_accuracy": 0.42637714743614197, + "num_tokens": 6505499496.0, + "step": 12725 + }, + { + "epoch": 3.4413196322336397, + "grad_norm": 3.375, + "learning_rate": 0.006211949680054256, + "loss": 3.0778, + "mean_token_accuracy": 0.41962891817092896, + "num_tokens": 6506023670.0, + "step": 12726 + }, + { + "epoch": 3.4415900486749593, + "grad_norm": 2.796875, + "learning_rate": 0.006210614875722715, + "loss": 2.9996, + "mean_token_accuracy": 0.4115416407585144, + "num_tokens": 6506547834.0, + "step": 12727 + }, + { + "epoch": 3.441860465116279, + "grad_norm": 3.203125, + "learning_rate": 0.0062092802183435995, + "loss": 3.019, + "mean_token_accuracy": 0.4225918650627136, + "num_tokens": 6507071947.0, + "step": 12728 + }, + { + "epoch": 3.4421308815575986, + "grad_norm": 2.953125, + "learning_rate": 0.006207945707957854, + "loss": 3.0321, + "mean_token_accuracy": 0.4052715003490448, + "num_tokens": 6507596186.0, + "step": 12729 + }, + { + "epoch": 3.4424012979989183, + "grad_norm": 3.296875, + "learning_rate": 0.006206611344606423, + "loss": 3.1064, + "mean_token_accuracy": 0.41423794627189636, + "num_tokens": 6508118130.0, + "step": 12730 + }, + { + "epoch": 3.442671714440238, + "grad_norm": 96.0, + "learning_rate": 0.006205277128330258, + "loss": 16.4503, + "mean_token_accuracy": 0.0, + "num_tokens": 6508589326.0, + "step": 12731 + }, + { + "epoch": 3.4429421308815575, + "grad_norm": 6.46875, + "learning_rate": 0.006203943059170286, + "loss": 3.1549, + "mean_token_accuracy": 0.39895856380462646, + "num_tokens": 6509113580.0, + "step": 12732 + }, + { + "epoch": 3.443212547322877, + "grad_norm": 2.234375, + "learning_rate": 0.00620260913716745, + "loss": 3.0395, + "mean_token_accuracy": 0.40220361948013306, + "num_tokens": 6509637796.0, + "step": 12733 + }, + { + "epoch": 3.443482963764197, + "grad_norm": 3.5625, + "learning_rate": 0.0062012753623626685, + "loss": 2.8539, + "mean_token_accuracy": 0.42984503507614136, + "num_tokens": 6510162060.0, + "step": 12734 + }, + { + "epoch": 3.4437533802055165, + "grad_norm": 2.703125, + "learning_rate": 0.006199941734796877, + "loss": 2.8608, + "mean_token_accuracy": 0.42070168256759644, + "num_tokens": 6510678825.0, + "step": 12735 + }, + { + "epoch": 3.444023796646836, + "grad_norm": 3.34375, + "learning_rate": 0.006198608254510982, + "loss": 3.0292, + "mean_token_accuracy": 0.42465534806251526, + "num_tokens": 6511158534.0, + "step": 12736 + }, + { + "epoch": 3.4442942130881558, + "grad_norm": 2.484375, + "learning_rate": 0.0061972749215459125, + "loss": 2.8907, + "mean_token_accuracy": 0.43087249994277954, + "num_tokens": 6511682671.0, + "step": 12737 + }, + { + "epoch": 3.4445646295294754, + "grad_norm": 2.375, + "learning_rate": 0.006195941735942571, + "loss": 2.9844, + "mean_token_accuracy": 0.44759830832481384, + "num_tokens": 6512131078.0, + "step": 12738 + }, + { + "epoch": 3.444835045970795, + "grad_norm": 2.6875, + "learning_rate": 0.006194608697741862, + "loss": 2.9142, + "mean_token_accuracy": 0.4172201156616211, + "num_tokens": 6512655280.0, + "step": 12739 + }, + { + "epoch": 3.4451054624121147, + "grad_norm": 3.328125, + "learning_rate": 0.006193275806984692, + "loss": 2.7597, + "mean_token_accuracy": 0.42565464973449707, + "num_tokens": 6513179504.0, + "step": 12740 + }, + { + "epoch": 3.4453758788534343, + "grad_norm": 3.109375, + "learning_rate": 0.006191943063711957, + "loss": 2.9697, + "mean_token_accuracy": 0.4285283386707306, + "num_tokens": 6513660376.0, + "step": 12741 + }, + { + "epoch": 3.445646295294754, + "grad_norm": 2.40625, + "learning_rate": 0.006190610467964545, + "loss": 2.9739, + "mean_token_accuracy": 0.41993409395217896, + "num_tokens": 6514158270.0, + "step": 12742 + }, + { + "epoch": 3.4459167117360736, + "grad_norm": 2.671875, + "learning_rate": 0.006189278019783351, + "loss": 2.9584, + "mean_token_accuracy": 0.42004597187042236, + "num_tokens": 6514682305.0, + "step": 12743 + }, + { + "epoch": 3.4461871281773933, + "grad_norm": 4.53125, + "learning_rate": 0.006187945719209254, + "loss": 2.9907, + "mean_token_accuracy": 0.41820448637008667, + "num_tokens": 6515206577.0, + "step": 12744 + }, + { + "epoch": 3.446457544618713, + "grad_norm": 2.1875, + "learning_rate": 0.00618661356628313, + "loss": 2.8212, + "mean_token_accuracy": 0.42064806818962097, + "num_tokens": 6515687205.0, + "step": 12745 + }, + { + "epoch": 3.4467279610600325, + "grad_norm": 7.53125, + "learning_rate": 0.006185281561045862, + "loss": 2.7498, + "mean_token_accuracy": 0.45384976267814636, + "num_tokens": 6516211304.0, + "step": 12746 + }, + { + "epoch": 3.446998377501352, + "grad_norm": 3.09375, + "learning_rate": 0.006183949703538309, + "loss": 2.8073, + "mean_token_accuracy": 0.3968319296836853, + "num_tokens": 6516735464.0, + "step": 12747 + }, + { + "epoch": 3.447268793942672, + "grad_norm": 4.03125, + "learning_rate": 0.006182617993801343, + "loss": 2.8643, + "mean_token_accuracy": 0.38841405510902405, + "num_tokens": 6517259651.0, + "step": 12748 + }, + { + "epoch": 3.4475392103839915, + "grad_norm": 2.515625, + "learning_rate": 0.006181286431875829, + "loss": 2.9263, + "mean_token_accuracy": 0.42705923318862915, + "num_tokens": 6517783794.0, + "step": 12749 + }, + { + "epoch": 3.447809626825311, + "grad_norm": 3.234375, + "learning_rate": 0.0061799550178026135, + "loss": 2.9945, + "mean_token_accuracy": 0.41760003566741943, + "num_tokens": 6518307864.0, + "step": 12750 + }, + { + "epoch": 3.4480800432666308, + "grad_norm": 8.6875, + "learning_rate": 0.006178623751622556, + "loss": 10.8549, + "mean_token_accuracy": 0.010634643025696278, + "num_tokens": 6518778602.0, + "step": 12751 + }, + { + "epoch": 3.4483504597079504, + "grad_norm": 7.96875, + "learning_rate": 0.0061772926333765014, + "loss": 3.1732, + "mean_token_accuracy": 0.4204147160053253, + "num_tokens": 6519302877.0, + "step": 12752 + }, + { + "epoch": 3.44862087614927, + "grad_norm": 3.015625, + "learning_rate": 0.0061759616631052875, + "loss": 2.7172, + "mean_token_accuracy": 0.41835713386535645, + "num_tokens": 6519809964.0, + "step": 12753 + }, + { + "epoch": 3.4488912925905897, + "grad_norm": 2.390625, + "learning_rate": 0.00617463084084976, + "loss": 3.1576, + "mean_token_accuracy": 0.3946981430053711, + "num_tokens": 6520334234.0, + "step": 12754 + }, + { + "epoch": 3.4491617090319093, + "grad_norm": 2.65625, + "learning_rate": 0.006173300166650748, + "loss": 3.044, + "mean_token_accuracy": 0.39990633726119995, + "num_tokens": 6520858405.0, + "step": 12755 + }, + { + "epoch": 3.449432125473229, + "grad_norm": 2.734375, + "learning_rate": 0.006171969640549078, + "loss": 2.9015, + "mean_token_accuracy": 0.4140692353248596, + "num_tokens": 6521382669.0, + "step": 12756 + }, + { + "epoch": 3.449702541914548, + "grad_norm": 3.15625, + "learning_rate": 0.006170639262585582, + "loss": 3.0999, + "mean_token_accuracy": 0.4169003963470459, + "num_tokens": 6521906864.0, + "step": 12757 + }, + { + "epoch": 3.4499729583558683, + "grad_norm": 2.796875, + "learning_rate": 0.006169309032801076, + "loss": 2.7161, + "mean_token_accuracy": 0.43675315380096436, + "num_tokens": 6522408143.0, + "step": 12758 + }, + { + "epoch": 3.4502433747971875, + "grad_norm": 2.34375, + "learning_rate": 0.006167978951236372, + "loss": 2.9, + "mean_token_accuracy": 0.4285617470741272, + "num_tokens": 6522932341.0, + "step": 12759 + }, + { + "epoch": 3.4505137912385075, + "grad_norm": 3.875, + "learning_rate": 0.006166649017932286, + "loss": 3.0511, + "mean_token_accuracy": 0.41525155305862427, + "num_tokens": 6523452837.0, + "step": 12760 + }, + { + "epoch": 3.4507842076798267, + "grad_norm": 2.65625, + "learning_rate": 0.006165319232929618, + "loss": 2.9371, + "mean_token_accuracy": 0.41386932134628296, + "num_tokens": 6523977020.0, + "step": 12761 + }, + { + "epoch": 3.4510546241211464, + "grad_norm": 3.125, + "learning_rate": 0.006163989596269178, + "loss": 3.0037, + "mean_token_accuracy": 0.3924921452999115, + "num_tokens": 6524501183.0, + "step": 12762 + }, + { + "epoch": 3.451325040562466, + "grad_norm": 3.09375, + "learning_rate": 0.006162660107991756, + "loss": 2.897, + "mean_token_accuracy": 0.4050649106502533, + "num_tokens": 6525025384.0, + "step": 12763 + }, + { + "epoch": 3.4515954570037857, + "grad_norm": 3.859375, + "learning_rate": 0.006161330768138152, + "loss": 2.8577, + "mean_token_accuracy": 0.4098131060600281, + "num_tokens": 6525549503.0, + "step": 12764 + }, + { + "epoch": 3.4518658734451053, + "grad_norm": 2.703125, + "learning_rate": 0.006160001576749143, + "loss": 2.8305, + "mean_token_accuracy": 0.42822059988975525, + "num_tokens": 6526073779.0, + "step": 12765 + }, + { + "epoch": 3.452136289886425, + "grad_norm": 3.515625, + "learning_rate": 0.006158672533865525, + "loss": 2.7791, + "mean_token_accuracy": 0.41749316453933716, + "num_tokens": 6526597919.0, + "step": 12766 + }, + { + "epoch": 3.4524067063277446, + "grad_norm": 2.515625, + "learning_rate": 0.00615734363952807, + "loss": 2.6526, + "mean_token_accuracy": 0.4415765404701233, + "num_tokens": 6527122059.0, + "step": 12767 + }, + { + "epoch": 3.4526771227690642, + "grad_norm": 2.96875, + "learning_rate": 0.006156014893777551, + "loss": 2.9744, + "mean_token_accuracy": 0.43362823128700256, + "num_tokens": 6527646252.0, + "step": 12768 + }, + { + "epoch": 3.452947539210384, + "grad_norm": 3.296875, + "learning_rate": 0.006154686296654744, + "loss": 2.8476, + "mean_token_accuracy": 0.43684521317481995, + "num_tokens": 6528170485.0, + "step": 12769 + }, + { + "epoch": 3.4532179556517035, + "grad_norm": 4.0625, + "learning_rate": 0.006153357848200409, + "loss": 3.1313, + "mean_token_accuracy": 0.417233943939209, + "num_tokens": 6528694680.0, + "step": 12770 + }, + { + "epoch": 3.453488372093023, + "grad_norm": 4.75, + "learning_rate": 0.006152029548455307, + "loss": 10.0337, + "mean_token_accuracy": 0.011580808088183403, + "num_tokens": 6529218818.0, + "step": 12771 + }, + { + "epoch": 3.453758788534343, + "grad_norm": 8.5625, + "learning_rate": 0.0061507013974601964, + "loss": 3.3517, + "mean_token_accuracy": 0.3672650456428528, + "num_tokens": 6529743043.0, + "step": 12772 + }, + { + "epoch": 3.4540292049756625, + "grad_norm": 2.359375, + "learning_rate": 0.006149373395255825, + "loss": 2.8806, + "mean_token_accuracy": 0.44225335121154785, + "num_tokens": 6530208007.0, + "step": 12773 + }, + { + "epoch": 3.454299621416982, + "grad_norm": 2.90625, + "learning_rate": 0.006148045541882947, + "loss": 2.9296, + "mean_token_accuracy": 0.407002717256546, + "num_tokens": 6530732252.0, + "step": 12774 + }, + { + "epoch": 3.4545700378583017, + "grad_norm": 2.828125, + "learning_rate": 0.006146717837382297, + "loss": 2.8496, + "mean_token_accuracy": 0.4093363285064697, + "num_tokens": 6531256489.0, + "step": 12775 + }, + { + "epoch": 3.4548404542996214, + "grad_norm": 3.765625, + "learning_rate": 0.006145390281794619, + "loss": 3.0708, + "mean_token_accuracy": 0.4191705584526062, + "num_tokens": 6531780630.0, + "step": 12776 + }, + { + "epoch": 3.455110870740941, + "grad_norm": 3.5625, + "learning_rate": 0.006144062875160641, + "loss": 3.018, + "mean_token_accuracy": 0.40653252601623535, + "num_tokens": 6532265018.0, + "step": 12777 + }, + { + "epoch": 3.4553812871822607, + "grad_norm": 3.0625, + "learning_rate": 0.006142735617521096, + "loss": 3.0415, + "mean_token_accuracy": 0.41825205087661743, + "num_tokens": 6532757875.0, + "step": 12778 + }, + { + "epoch": 3.4556517036235803, + "grad_norm": 2.828125, + "learning_rate": 0.006141408508916704, + "loss": 2.8377, + "mean_token_accuracy": 0.42857110500335693, + "num_tokens": 6533282146.0, + "step": 12779 + }, + { + "epoch": 3.4559221200649, + "grad_norm": 3.140625, + "learning_rate": 0.0061400815493881915, + "loss": 2.8843, + "mean_token_accuracy": 0.4304771423339844, + "num_tokens": 6533806309.0, + "step": 12780 + }, + { + "epoch": 3.4561925365062196, + "grad_norm": 2.828125, + "learning_rate": 0.006138754738976269, + "loss": 2.8498, + "mean_token_accuracy": 0.4264628291130066, + "num_tokens": 6534329669.0, + "step": 12781 + }, + { + "epoch": 3.4564629529475392, + "grad_norm": 2.53125, + "learning_rate": 0.006137428077721643, + "loss": 2.9905, + "mean_token_accuracy": 0.42873433232307434, + "num_tokens": 6534853811.0, + "step": 12782 + }, + { + "epoch": 3.456733369388859, + "grad_norm": 2.90625, + "learning_rate": 0.006136101565665027, + "loss": 2.8122, + "mean_token_accuracy": 0.4261264204978943, + "num_tokens": 6535377951.0, + "step": 12783 + }, + { + "epoch": 3.4570037858301785, + "grad_norm": 3.375, + "learning_rate": 0.006134775202847118, + "loss": 2.6434, + "mean_token_accuracy": 0.4394742548465729, + "num_tokens": 6535902143.0, + "step": 12784 + }, + { + "epoch": 3.457274202271498, + "grad_norm": 2.375, + "learning_rate": 0.006133448989308612, + "loss": 2.9107, + "mean_token_accuracy": 0.43863463401794434, + "num_tokens": 6536426361.0, + "step": 12785 + }, + { + "epoch": 3.457544618712818, + "grad_norm": 3.375, + "learning_rate": 0.006132122925090204, + "loss": 2.9406, + "mean_token_accuracy": 0.42830538749694824, + "num_tokens": 6536950626.0, + "step": 12786 + }, + { + "epoch": 3.4578150351541375, + "grad_norm": 2.796875, + "learning_rate": 0.0061307970102325825, + "loss": 2.9628, + "mean_token_accuracy": 0.43563470244407654, + "num_tokens": 6537414143.0, + "step": 12787 + }, + { + "epoch": 3.458085451595457, + "grad_norm": 3.5, + "learning_rate": 0.006129471244776423, + "loss": 2.9312, + "mean_token_accuracy": 0.4216850697994232, + "num_tokens": 6537938418.0, + "step": 12788 + }, + { + "epoch": 3.4583558680367767, + "grad_norm": 2.84375, + "learning_rate": 0.006128145628762413, + "loss": 3.0451, + "mean_token_accuracy": 0.4250650405883789, + "num_tokens": 6538462691.0, + "step": 12789 + }, + { + "epoch": 3.4586262844780964, + "grad_norm": 4.25, + "learning_rate": 0.006126820162231218, + "loss": 3.0483, + "mean_token_accuracy": 0.4197162687778473, + "num_tokens": 6538986904.0, + "step": 12790 + }, + { + "epoch": 3.458896700919416, + "grad_norm": 81.0, + "learning_rate": 0.006125494845223511, + "loss": 13.4877, + "mean_token_accuracy": 0.02839181013405323, + "num_tokens": 6539511079.0, + "step": 12791 + }, + { + "epoch": 3.4591671173607357, + "grad_norm": 8.1875, + "learning_rate": 0.006124169677779963, + "loss": 3.0619, + "mean_token_accuracy": 0.3912440240383148, + "num_tokens": 6540035340.0, + "step": 12792 + }, + { + "epoch": 3.4594375338020553, + "grad_norm": 2.421875, + "learning_rate": 0.006122844659941227, + "loss": 2.8387, + "mean_token_accuracy": 0.4241068363189697, + "num_tokens": 6540508231.0, + "step": 12793 + }, + { + "epoch": 3.459707950243375, + "grad_norm": 3.109375, + "learning_rate": 0.006121519791747956, + "loss": 3.1271, + "mean_token_accuracy": 0.40948349237442017, + "num_tokens": 6541032512.0, + "step": 12794 + }, + { + "epoch": 3.4599783666846946, + "grad_norm": 2.65625, + "learning_rate": 0.006120195073240808, + "loss": 2.785, + "mean_token_accuracy": 0.43174344301223755, + "num_tokens": 6541556727.0, + "step": 12795 + }, + { + "epoch": 3.4602487831260142, + "grad_norm": 2.65625, + "learning_rate": 0.006118870504460422, + "loss": 2.682, + "mean_token_accuracy": 0.4340519607067108, + "num_tokens": 6542047655.0, + "step": 12796 + }, + { + "epoch": 3.460519199567334, + "grad_norm": 2.578125, + "learning_rate": 0.0061175460854474475, + "loss": 2.9041, + "mean_token_accuracy": 0.4291509687900543, + "num_tokens": 6542571893.0, + "step": 12797 + }, + { + "epoch": 3.460789616008653, + "grad_norm": 3.390625, + "learning_rate": 0.006116221816242516, + "loss": 2.9515, + "mean_token_accuracy": 0.42343106865882874, + "num_tokens": 6543086996.0, + "step": 12798 + }, + { + "epoch": 3.461060032449973, + "grad_norm": 3.375, + "learning_rate": 0.006114897696886259, + "loss": 2.9652, + "mean_token_accuracy": 0.42188459634780884, + "num_tokens": 6543611201.0, + "step": 12799 + }, + { + "epoch": 3.4613304488912924, + "grad_norm": 3.296875, + "learning_rate": 0.00611357372741931, + "loss": 2.833, + "mean_token_accuracy": 0.4421658515930176, + "num_tokens": 6544072064.0, + "step": 12800 + }, + { + "epoch": 3.4616008653326125, + "grad_norm": 2.640625, + "learning_rate": 0.00611224990788229, + "loss": 2.8628, + "mean_token_accuracy": 0.41857823729515076, + "num_tokens": 6544596331.0, + "step": 12801 + }, + { + "epoch": 3.4618712817739317, + "grad_norm": 2.65625, + "learning_rate": 0.006110926238315812, + "loss": 2.7182, + "mean_token_accuracy": 0.44101253151893616, + "num_tokens": 6545120605.0, + "step": 12802 + }, + { + "epoch": 3.4621416982152513, + "grad_norm": 2.90625, + "learning_rate": 0.0061096027187604995, + "loss": 3.068, + "mean_token_accuracy": 0.42372047901153564, + "num_tokens": 6545644819.0, + "step": 12803 + }, + { + "epoch": 3.462412114656571, + "grad_norm": 3.734375, + "learning_rate": 0.0061082793492569535, + "loss": 2.9857, + "mean_token_accuracy": 0.4340437650680542, + "num_tokens": 6546146220.0, + "step": 12804 + }, + { + "epoch": 3.4626825310978906, + "grad_norm": 2.40625, + "learning_rate": 0.006106956129845782, + "loss": 2.9278, + "mean_token_accuracy": 0.42292821407318115, + "num_tokens": 6546670412.0, + "step": 12805 + }, + { + "epoch": 3.4629529475392102, + "grad_norm": 2.65625, + "learning_rate": 0.00610563306056759, + "loss": 2.835, + "mean_token_accuracy": 0.4388964772224426, + "num_tokens": 6547194657.0, + "step": 12806 + }, + { + "epoch": 3.46322336398053, + "grad_norm": 2.609375, + "learning_rate": 0.00610431014146297, + "loss": 2.8064, + "mean_token_accuracy": 0.4378582239151001, + "num_tokens": 6547661723.0, + "step": 12807 + }, + { + "epoch": 3.4634937804218495, + "grad_norm": 2.671875, + "learning_rate": 0.006102987372572506, + "loss": 2.7254, + "mean_token_accuracy": 0.43768858909606934, + "num_tokens": 6548180308.0, + "step": 12808 + }, + { + "epoch": 3.463764196863169, + "grad_norm": 2.546875, + "learning_rate": 0.006101664753936797, + "loss": 2.8121, + "mean_token_accuracy": 0.43539297580718994, + "num_tokens": 6548704587.0, + "step": 12809 + }, + { + "epoch": 3.464034613304489, + "grad_norm": 2.90625, + "learning_rate": 0.006100342285596416, + "loss": 2.8859, + "mean_token_accuracy": 0.4312574863433838, + "num_tokens": 6549228860.0, + "step": 12810 + }, + { + "epoch": 3.4643050297458085, + "grad_norm": 38.75, + "learning_rate": 0.006099019967591941, + "loss": 10.1895, + "mean_token_accuracy": 0.027741579338908195, + "num_tokens": 6549753063.0, + "step": 12811 + }, + { + "epoch": 3.464575446187128, + "grad_norm": 5.53125, + "learning_rate": 0.006097697799963948, + "loss": 3.1123, + "mean_token_accuracy": 0.40490102767944336, + "num_tokens": 6550276213.0, + "step": 12812 + }, + { + "epoch": 3.4648458626284477, + "grad_norm": 2.3125, + "learning_rate": 0.006096375782753005, + "loss": 2.9236, + "mean_token_accuracy": 0.4260515868663788, + "num_tokens": 6550739506.0, + "step": 12813 + }, + { + "epoch": 3.4651162790697674, + "grad_norm": 3.453125, + "learning_rate": 0.00609505391599967, + "loss": 2.8444, + "mean_token_accuracy": 0.46232113242149353, + "num_tokens": 6551164641.0, + "step": 12814 + }, + { + "epoch": 3.465386695511087, + "grad_norm": 3.625, + "learning_rate": 0.0060937321997445075, + "loss": 2.7583, + "mean_token_accuracy": 0.4983084201812744, + "num_tokens": 6551625477.0, + "step": 12815 + }, + { + "epoch": 3.4656571119524067, + "grad_norm": 2.546875, + "learning_rate": 0.006092410634028069, + "loss": 3.0367, + "mean_token_accuracy": 0.42913442850112915, + "num_tokens": 6552101171.0, + "step": 12816 + }, + { + "epoch": 3.4659275283937263, + "grad_norm": 2.90625, + "learning_rate": 0.006091089218890904, + "loss": 2.8751, + "mean_token_accuracy": 0.40669581294059753, + "num_tokens": 6552625401.0, + "step": 12817 + }, + { + "epoch": 3.466197944835046, + "grad_norm": 2.46875, + "learning_rate": 0.006089767954373555, + "loss": 2.9672, + "mean_token_accuracy": 0.4298151731491089, + "num_tokens": 6553149493.0, + "step": 12818 + }, + { + "epoch": 3.4664683612763656, + "grad_norm": 3.5625, + "learning_rate": 0.00608844684051657, + "loss": 3.153, + "mean_token_accuracy": 0.4067907929420471, + "num_tokens": 6553673717.0, + "step": 12819 + }, + { + "epoch": 3.4667387777176852, + "grad_norm": 2.890625, + "learning_rate": 0.0060871258773604765, + "loss": 2.9002, + "mean_token_accuracy": 0.43222182989120483, + "num_tokens": 6554197924.0, + "step": 12820 + }, + { + "epoch": 3.467009194159005, + "grad_norm": 3.28125, + "learning_rate": 0.006085805064945813, + "loss": 3.0318, + "mean_token_accuracy": 0.40903371572494507, + "num_tokens": 6554722160.0, + "step": 12821 + }, + { + "epoch": 3.4672796106003245, + "grad_norm": 3.265625, + "learning_rate": 0.006084484403313098, + "loss": 2.8226, + "mean_token_accuracy": 0.4266146719455719, + "num_tokens": 6555246429.0, + "step": 12822 + }, + { + "epoch": 3.467550027041644, + "grad_norm": 3.671875, + "learning_rate": 0.006083163892502859, + "loss": 3.0732, + "mean_token_accuracy": 0.41506314277648926, + "num_tokens": 6555770697.0, + "step": 12823 + }, + { + "epoch": 3.467820443482964, + "grad_norm": 2.984375, + "learning_rate": 0.006081843532555611, + "loss": 2.7028, + "mean_token_accuracy": 0.44194671511650085, + "num_tokens": 6556294912.0, + "step": 12824 + }, + { + "epoch": 3.4680908599242835, + "grad_norm": 3.453125, + "learning_rate": 0.006080523323511867, + "loss": 3.0064, + "mean_token_accuracy": 0.40507790446281433, + "num_tokens": 6556819130.0, + "step": 12825 + }, + { + "epoch": 3.468361276365603, + "grad_norm": 3.109375, + "learning_rate": 0.006079203265412134, + "loss": 2.7561, + "mean_token_accuracy": 0.4325587749481201, + "num_tokens": 6557285077.0, + "step": 12826 + }, + { + "epoch": 3.4686316928069227, + "grad_norm": 3.15625, + "learning_rate": 0.006077883358296918, + "loss": 2.9159, + "mean_token_accuracy": 0.4375937581062317, + "num_tokens": 6557728135.0, + "step": 12827 + }, + { + "epoch": 3.4689021092482424, + "grad_norm": 3.3125, + "learning_rate": 0.0060765636022067126, + "loss": 2.9449, + "mean_token_accuracy": 0.403622567653656, + "num_tokens": 6558252315.0, + "step": 12828 + }, + { + "epoch": 3.469172525689562, + "grad_norm": 3.0, + "learning_rate": 0.006075243997182016, + "loss": 2.8995, + "mean_token_accuracy": 0.42984795570373535, + "num_tokens": 6558744391.0, + "step": 12829 + }, + { + "epoch": 3.4694429421308817, + "grad_norm": 3.109375, + "learning_rate": 0.0060739245432633185, + "loss": 2.8596, + "mean_token_accuracy": 0.43723541498184204, + "num_tokens": 6559268613.0, + "step": 12830 + }, + { + "epoch": 3.4697133585722013, + "grad_norm": 43.25, + "learning_rate": 0.006072605240491098, + "loss": 12.4743, + "mean_token_accuracy": 0.007108541205525398, + "num_tokens": 6559792880.0, + "step": 12831 + }, + { + "epoch": 3.469983775013521, + "grad_norm": 6.15625, + "learning_rate": 0.006071286088905844, + "loss": 2.8887, + "mean_token_accuracy": 0.39781486988067627, + "num_tokens": 6560306413.0, + "step": 12832 + }, + { + "epoch": 3.4702541914548406, + "grad_norm": 2.40625, + "learning_rate": 0.006069967088548023, + "loss": 3.0229, + "mean_token_accuracy": 0.4446989893913269, + "num_tokens": 6560770387.0, + "step": 12833 + }, + { + "epoch": 3.4705246078961602, + "grad_norm": 2.84375, + "learning_rate": 0.00606864823945811, + "loss": 3.0272, + "mean_token_accuracy": 0.40857380628585815, + "num_tokens": 6561294648.0, + "step": 12834 + }, + { + "epoch": 3.47079502433748, + "grad_norm": 3.296875, + "learning_rate": 0.006067329541676574, + "loss": 3.019, + "mean_token_accuracy": 0.4037286043167114, + "num_tokens": 6561818928.0, + "step": 12835 + }, + { + "epoch": 3.4710654407787995, + "grad_norm": 5.03125, + "learning_rate": 0.006066010995243874, + "loss": 2.8636, + "mean_token_accuracy": 0.43653106689453125, + "num_tokens": 6562343113.0, + "step": 12836 + }, + { + "epoch": 3.471335857220119, + "grad_norm": 2.46875, + "learning_rate": 0.006064692600200463, + "loss": 3.05, + "mean_token_accuracy": 0.4147547781467438, + "num_tokens": 6562867292.0, + "step": 12837 + }, + { + "epoch": 3.471606273661439, + "grad_norm": 3.78125, + "learning_rate": 0.006063374356586802, + "loss": 2.9349, + "mean_token_accuracy": 0.4179683327674866, + "num_tokens": 6563391248.0, + "step": 12838 + }, + { + "epoch": 3.471876690102758, + "grad_norm": 1.8984375, + "learning_rate": 0.006062056264443331, + "loss": 2.858, + "mean_token_accuracy": 0.4406135380268097, + "num_tokens": 6563915527.0, + "step": 12839 + }, + { + "epoch": 3.472147106544078, + "grad_norm": 3.140625, + "learning_rate": 0.006060738323810493, + "loss": 2.9688, + "mean_token_accuracy": 0.4316723942756653, + "num_tokens": 6564439741.0, + "step": 12840 + }, + { + "epoch": 3.4724175229853973, + "grad_norm": 3.015625, + "learning_rate": 0.006059420534728732, + "loss": 2.8088, + "mean_token_accuracy": 0.4293519854545593, + "num_tokens": 6564964000.0, + "step": 12841 + }, + { + "epoch": 3.4726879394267174, + "grad_norm": 3.03125, + "learning_rate": 0.0060581028972384755, + "loss": 3.0808, + "mean_token_accuracy": 0.4135041832923889, + "num_tokens": 6565474808.0, + "step": 12842 + }, + { + "epoch": 3.4729583558680366, + "grad_norm": 2.96875, + "learning_rate": 0.006056785411380157, + "loss": 2.9238, + "mean_token_accuracy": 0.4242091774940491, + "num_tokens": 6565999023.0, + "step": 12843 + }, + { + "epoch": 3.4732287723093562, + "grad_norm": 2.765625, + "learning_rate": 0.006055468077194202, + "loss": 2.8266, + "mean_token_accuracy": 0.4424933195114136, + "num_tokens": 6566494280.0, + "step": 12844 + }, + { + "epoch": 3.473499188750676, + "grad_norm": 2.890625, + "learning_rate": 0.006054150894721022, + "loss": 2.9302, + "mean_token_accuracy": 0.4368633031845093, + "num_tokens": 6567018537.0, + "step": 12845 + }, + { + "epoch": 3.4737696051919955, + "grad_norm": 3.203125, + "learning_rate": 0.00605283386400104, + "loss": 3.0992, + "mean_token_accuracy": 0.40353962779045105, + "num_tokens": 6567542815.0, + "step": 12846 + }, + { + "epoch": 3.474040021633315, + "grad_norm": 3.296875, + "learning_rate": 0.006051516985074661, + "loss": 2.9989, + "mean_token_accuracy": 0.43228334188461304, + "num_tokens": 6568040305.0, + "step": 12847 + }, + { + "epoch": 3.474310438074635, + "grad_norm": 3.140625, + "learning_rate": 0.006050200257982295, + "loss": 3.0533, + "mean_token_accuracy": 0.42547607421875, + "num_tokens": 6568529304.0, + "step": 12848 + }, + { + "epoch": 3.4745808545159544, + "grad_norm": 3.078125, + "learning_rate": 0.006048883682764342, + "loss": 2.9321, + "mean_token_accuracy": 0.4181589186191559, + "num_tokens": 6569053259.0, + "step": 12849 + }, + { + "epoch": 3.474851270957274, + "grad_norm": 3.171875, + "learning_rate": 0.006047567259461199, + "loss": 2.9091, + "mean_token_accuracy": 0.41842129826545715, + "num_tokens": 6569577395.0, + "step": 12850 + }, + { + "epoch": 3.4751216873985937, + "grad_norm": 15.3125, + "learning_rate": 0.006046250988113254, + "loss": 9.9991, + "mean_token_accuracy": 0.007752190809696913, + "num_tokens": 6570101670.0, + "step": 12851 + }, + { + "epoch": 3.4753921038399134, + "grad_norm": 5.65625, + "learning_rate": 0.006044934868760901, + "loss": 3.1439, + "mean_token_accuracy": 0.40334033966064453, + "num_tokens": 6570625942.0, + "step": 12852 + }, + { + "epoch": 3.475662520281233, + "grad_norm": 2.078125, + "learning_rate": 0.006043618901444515, + "loss": 3.1301, + "mean_token_accuracy": 0.3974529504776001, + "num_tokens": 6571150214.0, + "step": 12853 + }, + { + "epoch": 3.4759329367225527, + "grad_norm": 3.09375, + "learning_rate": 0.006042303086204477, + "loss": 2.8566, + "mean_token_accuracy": 0.4396490752696991, + "num_tokens": 6571674366.0, + "step": 12854 + }, + { + "epoch": 3.4762033531638723, + "grad_norm": 2.6875, + "learning_rate": 0.0060409874230811615, + "loss": 2.9556, + "mean_token_accuracy": 0.44257891178131104, + "num_tokens": 6572096859.0, + "step": 12855 + }, + { + "epoch": 3.476473769605192, + "grad_norm": 2.921875, + "learning_rate": 0.006039671912114935, + "loss": 2.9002, + "mean_token_accuracy": 0.43676745891571045, + "num_tokens": 6572555713.0, + "step": 12856 + }, + { + "epoch": 3.4767441860465116, + "grad_norm": 2.484375, + "learning_rate": 0.00603835655334616, + "loss": 2.7898, + "mean_token_accuracy": 0.43316709995269775, + "num_tokens": 6573079996.0, + "step": 12857 + }, + { + "epoch": 3.4770146024878312, + "grad_norm": 3.3125, + "learning_rate": 0.006037041346815199, + "loss": 3.0045, + "mean_token_accuracy": 0.40107789635658264, + "num_tokens": 6573580037.0, + "step": 12858 + }, + { + "epoch": 3.477285018929151, + "grad_norm": 4.15625, + "learning_rate": 0.006035726292562405, + "loss": 2.9324, + "mean_token_accuracy": 0.430147647857666, + "num_tokens": 6574060792.0, + "step": 12859 + }, + { + "epoch": 3.4775554353704705, + "grad_norm": 3.40625, + "learning_rate": 0.006034411390628123, + "loss": 3.0745, + "mean_token_accuracy": 0.4196023941040039, + "num_tokens": 6574584940.0, + "step": 12860 + }, + { + "epoch": 3.47782585181179, + "grad_norm": 3.3125, + "learning_rate": 0.006033096641052705, + "loss": 3.0078, + "mean_token_accuracy": 0.4349917769432068, + "num_tokens": 6574963503.0, + "step": 12861 + }, + { + "epoch": 3.47809626825311, + "grad_norm": 2.4375, + "learning_rate": 0.006031782043876488, + "loss": 2.9502, + "mean_token_accuracy": 0.4389459490776062, + "num_tokens": 6575487707.0, + "step": 12862 + }, + { + "epoch": 3.4783666846944294, + "grad_norm": 3.375, + "learning_rate": 0.006030467599139808, + "loss": 2.861, + "mean_token_accuracy": 0.42381221055984497, + "num_tokens": 6576011914.0, + "step": 12863 + }, + { + "epoch": 3.478637101135749, + "grad_norm": 2.9375, + "learning_rate": 0.006029153306883, + "loss": 2.8506, + "mean_token_accuracy": 0.42277246713638306, + "num_tokens": 6576536140.0, + "step": 12864 + }, + { + "epoch": 3.4789075175770687, + "grad_norm": 2.90625, + "learning_rate": 0.006027839167146383, + "loss": 2.8857, + "mean_token_accuracy": 0.44428473711013794, + "num_tokens": 6577060222.0, + "step": 12865 + }, + { + "epoch": 3.4791779340183884, + "grad_norm": 3.140625, + "learning_rate": 0.006026525179970286, + "loss": 2.86, + "mean_token_accuracy": 0.4540223777294159, + "num_tokens": 6577584491.0, + "step": 12866 + }, + { + "epoch": 3.479448350459708, + "grad_norm": 2.78125, + "learning_rate": 0.0060252113453950215, + "loss": 2.9547, + "mean_token_accuracy": 0.41913437843322754, + "num_tokens": 6578108582.0, + "step": 12867 + }, + { + "epoch": 3.4797187669010277, + "grad_norm": 2.78125, + "learning_rate": 0.0060238976634609, + "loss": 2.7482, + "mean_token_accuracy": 0.4396224915981293, + "num_tokens": 6578620749.0, + "step": 12868 + }, + { + "epoch": 3.4799891833423473, + "grad_norm": 3.25, + "learning_rate": 0.006022584134208236, + "loss": 3.0133, + "mean_token_accuracy": 0.43161213397979736, + "num_tokens": 6579045961.0, + "step": 12869 + }, + { + "epoch": 3.480259599783667, + "grad_norm": 2.625, + "learning_rate": 0.006021270757677328, + "loss": 2.7717, + "mean_token_accuracy": 0.44440269470214844, + "num_tokens": 6579570206.0, + "step": 12870 + }, + { + "epoch": 3.4805300162249866, + "grad_norm": 45.25, + "learning_rate": 0.0060199575339084715, + "loss": 12.8898, + "mean_token_accuracy": 0.038610637187957764, + "num_tokens": 6580094323.0, + "step": 12871 + }, + { + "epoch": 3.4808004326663062, + "grad_norm": 5.78125, + "learning_rate": 0.006018644462941967, + "loss": 2.8445, + "mean_token_accuracy": 0.4306049346923828, + "num_tokens": 6580556775.0, + "step": 12872 + }, + { + "epoch": 3.481070849107626, + "grad_norm": 2.3125, + "learning_rate": 0.0060173315448180985, + "loss": 2.972, + "mean_token_accuracy": 0.4245791733264923, + "num_tokens": 6581081001.0, + "step": 12873 + }, + { + "epoch": 3.4813412655489455, + "grad_norm": 3.109375, + "learning_rate": 0.006016018779577148, + "loss": 2.9887, + "mean_token_accuracy": 0.42088907957077026, + "num_tokens": 6581605149.0, + "step": 12874 + }, + { + "epoch": 3.481611681990265, + "grad_norm": 3.0, + "learning_rate": 0.006014706167259398, + "loss": 3.0736, + "mean_token_accuracy": 0.4145473837852478, + "num_tokens": 6582129261.0, + "step": 12875 + }, + { + "epoch": 3.481882098431585, + "grad_norm": 3.671875, + "learning_rate": 0.006013393707905128, + "loss": 3.0091, + "mean_token_accuracy": 0.4041707515716553, + "num_tokens": 6582653438.0, + "step": 12876 + }, + { + "epoch": 3.4821525148729044, + "grad_norm": 3.203125, + "learning_rate": 0.0060120814015546, + "loss": 2.8091, + "mean_token_accuracy": 0.4466581344604492, + "num_tokens": 6583177707.0, + "step": 12877 + }, + { + "epoch": 3.482422931314224, + "grad_norm": 3.28125, + "learning_rate": 0.006010769248248086, + "loss": 3.1358, + "mean_token_accuracy": 0.4124452471733093, + "num_tokens": 6583701977.0, + "step": 12878 + }, + { + "epoch": 3.4826933477555437, + "grad_norm": 2.765625, + "learning_rate": 0.006009457248025844, + "loss": 2.7619, + "mean_token_accuracy": 0.4272199273109436, + "num_tokens": 6584167592.0, + "step": 12879 + }, + { + "epoch": 3.482963764196863, + "grad_norm": 2.328125, + "learning_rate": 0.006008145400928127, + "loss": 2.8906, + "mean_token_accuracy": 0.42468661069869995, + "num_tokens": 6584691695.0, + "step": 12880 + }, + { + "epoch": 3.483234180638183, + "grad_norm": 3.203125, + "learning_rate": 0.006006833706995192, + "loss": 2.8714, + "mean_token_accuracy": 0.4115753769874573, + "num_tokens": 6585208597.0, + "step": 12881 + }, + { + "epoch": 3.483504597079502, + "grad_norm": 2.578125, + "learning_rate": 0.006005522166267283, + "loss": 2.9968, + "mean_token_accuracy": 0.41703516244888306, + "num_tokens": 6585732745.0, + "step": 12882 + }, + { + "epoch": 3.4837750135208223, + "grad_norm": 2.953125, + "learning_rate": 0.006004210778784638, + "loss": 3.0373, + "mean_token_accuracy": 0.41630101203918457, + "num_tokens": 6586256986.0, + "step": 12883 + }, + { + "epoch": 3.4840454299621415, + "grad_norm": 3.34375, + "learning_rate": 0.006002899544587503, + "loss": 2.8653, + "mean_token_accuracy": 0.426011860370636, + "num_tokens": 6586781168.0, + "step": 12884 + }, + { + "epoch": 3.484315846403461, + "grad_norm": 3.84375, + "learning_rate": 0.006001588463716103, + "loss": 2.9642, + "mean_token_accuracy": 0.41123515367507935, + "num_tokens": 6587295512.0, + "step": 12885 + }, + { + "epoch": 3.484586262844781, + "grad_norm": 4.03125, + "learning_rate": 0.006000277536210666, + "loss": 3.0594, + "mean_token_accuracy": 0.40523627400398254, + "num_tokens": 6587807068.0, + "step": 12886 + }, + { + "epoch": 3.4848566792861004, + "grad_norm": 4.28125, + "learning_rate": 0.005998966762111422, + "loss": 3.1521, + "mean_token_accuracy": 0.39956068992614746, + "num_tokens": 6588331259.0, + "step": 12887 + }, + { + "epoch": 3.48512709572742, + "grad_norm": 3.6875, + "learning_rate": 0.00599765614145858, + "loss": 2.9809, + "mean_token_accuracy": 0.42469531297683716, + "num_tokens": 6588827974.0, + "step": 12888 + }, + { + "epoch": 3.4853975121687397, + "grad_norm": 3.390625, + "learning_rate": 0.0059963456742923625, + "loss": 3.008, + "mean_token_accuracy": 0.42353448271751404, + "num_tokens": 6589352196.0, + "step": 12889 + }, + { + "epoch": 3.4856679286100594, + "grad_norm": 3.234375, + "learning_rate": 0.005995035360652969, + "loss": 3.0649, + "mean_token_accuracy": 0.38628479838371277, + "num_tokens": 6589876480.0, + "step": 12890 + }, + { + "epoch": 3.485938345051379, + "grad_norm": 42.25, + "learning_rate": 0.005993725200580613, + "loss": 9.6519, + "mean_token_accuracy": 0.027233220636844635, + "num_tokens": 6590400754.0, + "step": 12891 + }, + { + "epoch": 3.4862087614926986, + "grad_norm": 6.5625, + "learning_rate": 0.005992415194115493, + "loss": 3.1267, + "mean_token_accuracy": 0.41029417514801025, + "num_tokens": 6590924915.0, + "step": 12892 + }, + { + "epoch": 3.4864791779340183, + "grad_norm": 2.90625, + "learning_rate": 0.005991105341297799, + "loss": 2.6535, + "mean_token_accuracy": 0.4761717915534973, + "num_tokens": 6591448942.0, + "step": 12893 + }, + { + "epoch": 3.486749594375338, + "grad_norm": 2.546875, + "learning_rate": 0.005989795642167721, + "loss": 2.8856, + "mean_token_accuracy": 0.4402729868888855, + "num_tokens": 6591973209.0, + "step": 12894 + }, + { + "epoch": 3.4870200108166576, + "grad_norm": 3.609375, + "learning_rate": 0.005988486096765451, + "loss": 2.8635, + "mean_token_accuracy": 0.41079750657081604, + "num_tokens": 6592497345.0, + "step": 12895 + }, + { + "epoch": 3.487290427257977, + "grad_norm": 3.15625, + "learning_rate": 0.005987176705131163, + "loss": 2.9229, + "mean_token_accuracy": 0.41792044043540955, + "num_tokens": 6593021530.0, + "step": 12896 + }, + { + "epoch": 3.487560843699297, + "grad_norm": 3.09375, + "learning_rate": 0.005985867467305035, + "loss": 3.0403, + "mean_token_accuracy": 0.42388373613357544, + "num_tokens": 6593545689.0, + "step": 12897 + }, + { + "epoch": 3.4878312601406165, + "grad_norm": 2.890625, + "learning_rate": 0.005984558383327241, + "loss": 2.9201, + "mean_token_accuracy": 0.42698606848716736, + "num_tokens": 6594069841.0, + "step": 12898 + }, + { + "epoch": 3.488101676581936, + "grad_norm": 2.46875, + "learning_rate": 0.005983249453237944, + "loss": 2.9235, + "mean_token_accuracy": 0.43961188197135925, + "num_tokens": 6594594122.0, + "step": 12899 + }, + { + "epoch": 3.488372093023256, + "grad_norm": 3.03125, + "learning_rate": 0.005981940677077305, + "loss": 2.9232, + "mean_token_accuracy": 0.4166582524776459, + "num_tokens": 6595118278.0, + "step": 12900 + }, + { + "epoch": 3.4886425094645754, + "grad_norm": 3.015625, + "learning_rate": 0.005980632054885486, + "loss": 2.7887, + "mean_token_accuracy": 0.42717868089675903, + "num_tokens": 6595642545.0, + "step": 12901 + }, + { + "epoch": 3.488912925905895, + "grad_norm": 2.578125, + "learning_rate": 0.005979323586702634, + "loss": 2.9336, + "mean_token_accuracy": 0.4273443818092346, + "num_tokens": 6596166732.0, + "step": 12902 + }, + { + "epoch": 3.4891833423472147, + "grad_norm": 3.78125, + "learning_rate": 0.005978015272568902, + "loss": 3.1197, + "mean_token_accuracy": 0.4059687554836273, + "num_tokens": 6596690906.0, + "step": 12903 + }, + { + "epoch": 3.4894537587885344, + "grad_norm": 3.859375, + "learning_rate": 0.005976707112524427, + "loss": 2.8645, + "mean_token_accuracy": 0.4593440294265747, + "num_tokens": 6597152249.0, + "step": 12904 + }, + { + "epoch": 3.489724175229854, + "grad_norm": 4.0, + "learning_rate": 0.005975399106609352, + "loss": 2.8276, + "mean_token_accuracy": 0.42144322395324707, + "num_tokens": 6597676465.0, + "step": 12905 + }, + { + "epoch": 3.4899945916711737, + "grad_norm": 2.828125, + "learning_rate": 0.005974091254863808, + "loss": 2.8083, + "mean_token_accuracy": 0.4248223304748535, + "num_tokens": 6598200737.0, + "step": 12906 + }, + { + "epoch": 3.4902650081124933, + "grad_norm": 3.59375, + "learning_rate": 0.0059727835573279275, + "loss": 3.0195, + "mean_token_accuracy": 0.41754209995269775, + "num_tokens": 6598724959.0, + "step": 12907 + }, + { + "epoch": 3.490535424553813, + "grad_norm": 2.90625, + "learning_rate": 0.00597147601404183, + "loss": 2.9892, + "mean_token_accuracy": 0.41642308235168457, + "num_tokens": 6599249167.0, + "step": 12908 + }, + { + "epoch": 3.4908058409951326, + "grad_norm": 3.1875, + "learning_rate": 0.005970168625045635, + "loss": 2.8565, + "mean_token_accuracy": 0.4330027401447296, + "num_tokens": 6599743369.0, + "step": 12909 + }, + { + "epoch": 3.4910762574364522, + "grad_norm": 3.078125, + "learning_rate": 0.0059688613903794594, + "loss": 3.0698, + "mean_token_accuracy": 0.39229366183280945, + "num_tokens": 6600267645.0, + "step": 12910 + }, + { + "epoch": 3.491346673877772, + "grad_norm": 14.875, + "learning_rate": 0.005967554310083409, + "loss": 9.2112, + "mean_token_accuracy": 0.010281339287757874, + "num_tokens": 6600743218.0, + "step": 12911 + }, + { + "epoch": 3.4916170903190915, + "grad_norm": 6.4375, + "learning_rate": 0.0059662473841975965, + "loss": 3.1244, + "mean_token_accuracy": 0.39748334884643555, + "num_tokens": 6601209465.0, + "step": 12912 + }, + { + "epoch": 3.491887506760411, + "grad_norm": 2.171875, + "learning_rate": 0.005964940612762117, + "loss": 2.7577, + "mean_token_accuracy": 0.41895467042922974, + "num_tokens": 6601733684.0, + "step": 12913 + }, + { + "epoch": 3.492157923201731, + "grad_norm": 12.6875, + "learning_rate": 0.005963633995817063, + "loss": 2.7589, + "mean_token_accuracy": 0.44884470105171204, + "num_tokens": 6602257809.0, + "step": 12914 + }, + { + "epoch": 3.4924283396430504, + "grad_norm": 4.5, + "learning_rate": 0.005962327533402533, + "loss": 3.0674, + "mean_token_accuracy": 0.4120147228240967, + "num_tokens": 6602781932.0, + "step": 12915 + }, + { + "epoch": 3.49269875608437, + "grad_norm": 2.484375, + "learning_rate": 0.005961021225558609, + "loss": 3.0864, + "mean_token_accuracy": 0.4012655019760132, + "num_tokens": 6603306208.0, + "step": 12916 + }, + { + "epoch": 3.4929691725256897, + "grad_norm": 3.328125, + "learning_rate": 0.005959715072325369, + "loss": 2.8857, + "mean_token_accuracy": 0.41752785444259644, + "num_tokens": 6603794145.0, + "step": 12917 + }, + { + "epoch": 3.4932395889670094, + "grad_norm": 2.5, + "learning_rate": 0.005958409073742892, + "loss": 2.7623, + "mean_token_accuracy": 0.4387392997741699, + "num_tokens": 6604267136.0, + "step": 12918 + }, + { + "epoch": 3.493510005408329, + "grad_norm": 2.703125, + "learning_rate": 0.005957103229851255, + "loss": 2.9555, + "mean_token_accuracy": 0.4078502655029297, + "num_tokens": 6604791344.0, + "step": 12919 + }, + { + "epoch": 3.4937804218496487, + "grad_norm": 2.84375, + "learning_rate": 0.005955797540690519, + "loss": 2.9564, + "mean_token_accuracy": 0.4324687719345093, + "num_tokens": 6605315607.0, + "step": 12920 + }, + { + "epoch": 3.494050838290968, + "grad_norm": 3.421875, + "learning_rate": 0.00595449200630075, + "loss": 3.0481, + "mean_token_accuracy": 0.4175356328487396, + "num_tokens": 6605839822.0, + "step": 12921 + }, + { + "epoch": 3.494321254732288, + "grad_norm": 2.8125, + "learning_rate": 0.005953186626722005, + "loss": 3.0089, + "mean_token_accuracy": 0.42524829506874084, + "num_tokens": 6606307362.0, + "step": 12922 + }, + { + "epoch": 3.494591671173607, + "grad_norm": 3.28125, + "learning_rate": 0.005951881401994334, + "loss": 3.0029, + "mean_token_accuracy": 0.40619686245918274, + "num_tokens": 6606831445.0, + "step": 12923 + }, + { + "epoch": 3.4948620876149272, + "grad_norm": 3.015625, + "learning_rate": 0.005950576332157789, + "loss": 2.7624, + "mean_token_accuracy": 0.42953234910964966, + "num_tokens": 6607355729.0, + "step": 12924 + }, + { + "epoch": 3.4951325040562464, + "grad_norm": 2.71875, + "learning_rate": 0.005949271417252411, + "loss": 2.7519, + "mean_token_accuracy": 0.42713314294815063, + "num_tokens": 6607879922.0, + "step": 12925 + }, + { + "epoch": 3.495402920497566, + "grad_norm": 3.109375, + "learning_rate": 0.005947966657318237, + "loss": 2.8563, + "mean_token_accuracy": 0.40708813071250916, + "num_tokens": 6608404139.0, + "step": 12926 + }, + { + "epoch": 3.4956733369388857, + "grad_norm": 4.09375, + "learning_rate": 0.005946662052395305, + "loss": 2.4192, + "mean_token_accuracy": 0.4785059690475464, + "num_tokens": 6608906903.0, + "step": 12927 + }, + { + "epoch": 3.4959437533802054, + "grad_norm": 2.4375, + "learning_rate": 0.005945357602523642, + "loss": 2.9184, + "mean_token_accuracy": 0.4426950216293335, + "num_tokens": 6609381898.0, + "step": 12928 + }, + { + "epoch": 3.496214169821525, + "grad_norm": 3.640625, + "learning_rate": 0.005944053307743271, + "loss": 2.8802, + "mean_token_accuracy": 0.428138792514801, + "num_tokens": 6609906089.0, + "step": 12929 + }, + { + "epoch": 3.4964845862628446, + "grad_norm": 3.109375, + "learning_rate": 0.0059427491680942145, + "loss": 3.0463, + "mean_token_accuracy": 0.40228235721588135, + "num_tokens": 6610430349.0, + "step": 12930 + }, + { + "epoch": 3.4967550027041643, + "grad_norm": 19.125, + "learning_rate": 0.005941445183616484, + "loss": 9.6805, + "mean_token_accuracy": 0.005336185917258263, + "num_tokens": 6610883739.0, + "step": 12931 + }, + { + "epoch": 3.497025419145484, + "grad_norm": 6.5625, + "learning_rate": 0.005940141354350095, + "loss": 3.2491, + "mean_token_accuracy": 0.39194604754447937, + "num_tokens": 6611384444.0, + "step": 12932 + }, + { + "epoch": 3.4972958355868036, + "grad_norm": 2.359375, + "learning_rate": 0.005938837680335045, + "loss": 2.8198, + "mean_token_accuracy": 0.43647080659866333, + "num_tokens": 6611908582.0, + "step": 12933 + }, + { + "epoch": 3.497566252028123, + "grad_norm": 2.671875, + "learning_rate": 0.00593753416161134, + "loss": 3.1075, + "mean_token_accuracy": 0.42657893896102905, + "num_tokens": 6612432857.0, + "step": 12934 + }, + { + "epoch": 3.497836668469443, + "grad_norm": 3.03125, + "learning_rate": 0.005936230798218977, + "loss": 3.0409, + "mean_token_accuracy": 0.4229467511177063, + "num_tokens": 6612892455.0, + "step": 12935 + }, + { + "epoch": 3.4981070849107625, + "grad_norm": 2.765625, + "learning_rate": 0.005934927590197945, + "loss": 2.8201, + "mean_token_accuracy": 0.450292706489563, + "num_tokens": 6613398828.0, + "step": 12936 + }, + { + "epoch": 3.498377501352082, + "grad_norm": 2.546875, + "learning_rate": 0.005933624537588226, + "loss": 3.0037, + "mean_token_accuracy": 0.4164264500141144, + "num_tokens": 6613922989.0, + "step": 12937 + }, + { + "epoch": 3.498647917793402, + "grad_norm": 2.46875, + "learning_rate": 0.00593232164042981, + "loss": 2.833, + "mean_token_accuracy": 0.4387053847312927, + "num_tokens": 6614447257.0, + "step": 12938 + }, + { + "epoch": 3.4989183342347214, + "grad_norm": 2.765625, + "learning_rate": 0.005931018898762668, + "loss": 3.1394, + "mean_token_accuracy": 0.4122331142425537, + "num_tokens": 6614937044.0, + "step": 12939 + }, + { + "epoch": 3.499188750676041, + "grad_norm": 2.9375, + "learning_rate": 0.005929716312626771, + "loss": 2.8152, + "mean_token_accuracy": 0.4398287832736969, + "num_tokens": 6615461218.0, + "step": 12940 + }, + { + "epoch": 3.4994591671173607, + "grad_norm": 2.921875, + "learning_rate": 0.0059284138820620905, + "loss": 2.8723, + "mean_token_accuracy": 0.42079076170921326, + "num_tokens": 6615985445.0, + "step": 12941 + }, + { + "epoch": 3.4997295835586804, + "grad_norm": 3.46875, + "learning_rate": 0.005927111607108586, + "loss": 3.0297, + "mean_token_accuracy": 0.42388466000556946, + "num_tokens": 6616426148.0, + "step": 12942 + }, + { + "epoch": 3.5, + "grad_norm": 2.703125, + "learning_rate": 0.0059258094878062145, + "loss": 2.5846, + "mean_token_accuracy": 0.4622962474822998, + "num_tokens": 6616922858.0, + "step": 12943 + }, + { + "epoch": 3.5002704164413196, + "grad_norm": 2.328125, + "learning_rate": 0.005924507524194932, + "loss": 2.9327, + "mean_token_accuracy": 0.4415090084075928, + "num_tokens": 6617447076.0, + "step": 12944 + }, + { + "epoch": 3.5005408328826393, + "grad_norm": 22.625, + "learning_rate": 0.005923205716314682, + "loss": 2.7829, + "mean_token_accuracy": 0.477575421333313, + "num_tokens": 6617934461.0, + "step": 12945 + }, + { + "epoch": 3.500811249323959, + "grad_norm": 5.84375, + "learning_rate": 0.005921904064205413, + "loss": 3.0831, + "mean_token_accuracy": 0.4007262587547302, + "num_tokens": 6618458717.0, + "step": 12946 + }, + { + "epoch": 3.5010816657652786, + "grad_norm": 20.875, + "learning_rate": 0.005920602567907059, + "loss": 3.0492, + "mean_token_accuracy": 0.44065019488334656, + "num_tokens": 6618983005.0, + "step": 12947 + }, + { + "epoch": 3.501352082206598, + "grad_norm": 5.0, + "learning_rate": 0.005919301227459558, + "loss": 2.8841, + "mean_token_accuracy": 0.4134138822555542, + "num_tokens": 6619507263.0, + "step": 12948 + }, + { + "epoch": 3.501622498647918, + "grad_norm": 2.25, + "learning_rate": 0.005918000042902834, + "loss": 3.0435, + "mean_token_accuracy": 0.42488807439804077, + "num_tokens": 6620017085.0, + "step": 12949 + }, + { + "epoch": 3.5018929150892375, + "grad_norm": 2.578125, + "learning_rate": 0.005916699014276816, + "loss": 2.8134, + "mean_token_accuracy": 0.4150697588920593, + "num_tokens": 6620516172.0, + "step": 12950 + }, + { + "epoch": 3.502163331530557, + "grad_norm": 162.0, + "learning_rate": 0.005915398141621424, + "loss": 11.4036, + "mean_token_accuracy": 0.014335673302412033, + "num_tokens": 6621040453.0, + "step": 12951 + }, + { + "epoch": 3.502433747971877, + "grad_norm": 5.65625, + "learning_rate": 0.005914097424976564, + "loss": 3.1882, + "mean_token_accuracy": 0.4070446193218231, + "num_tokens": 6621531663.0, + "step": 12952 + }, + { + "epoch": 3.5027041644131964, + "grad_norm": 2.515625, + "learning_rate": 0.005912796864382155, + "loss": 2.9058, + "mean_token_accuracy": 0.4146835207939148, + "num_tokens": 6622055899.0, + "step": 12953 + }, + { + "epoch": 3.502974580854516, + "grad_norm": 2.75, + "learning_rate": 0.0059114964598781005, + "loss": 3.1077, + "mean_token_accuracy": 0.4036298990249634, + "num_tokens": 6622580150.0, + "step": 12954 + }, + { + "epoch": 3.5032449972958357, + "grad_norm": 2.9375, + "learning_rate": 0.005910196211504294, + "loss": 2.78, + "mean_token_accuracy": 0.4314514398574829, + "num_tokens": 6623104320.0, + "step": 12955 + }, + { + "epoch": 3.5035154137371554, + "grad_norm": 3.375, + "learning_rate": 0.00590889611930064, + "loss": 3.0036, + "mean_token_accuracy": 0.41195082664489746, + "num_tokens": 6623628564.0, + "step": 12956 + }, + { + "epoch": 3.503785830178475, + "grad_norm": 3.15625, + "learning_rate": 0.00590759618330702, + "loss": 3.0112, + "mean_token_accuracy": 0.4246666431427002, + "num_tokens": 6624105105.0, + "step": 12957 + }, + { + "epoch": 3.5040562466197946, + "grad_norm": 3.28125, + "learning_rate": 0.0059062964035633304, + "loss": 2.968, + "mean_token_accuracy": 0.3999568819999695, + "num_tokens": 6624629151.0, + "step": 12958 + }, + { + "epoch": 3.5043266630611143, + "grad_norm": 2.265625, + "learning_rate": 0.005904996780109446, + "loss": 2.6973, + "mean_token_accuracy": 0.4378897547721863, + "num_tokens": 6625143143.0, + "step": 12959 + }, + { + "epoch": 3.5045970795024335, + "grad_norm": 2.640625, + "learning_rate": 0.005903697312985241, + "loss": 3.0007, + "mean_token_accuracy": 0.4194929599761963, + "num_tokens": 6625667194.0, + "step": 12960 + }, + { + "epoch": 3.5048674959437536, + "grad_norm": 2.875, + "learning_rate": 0.005902398002230589, + "loss": 2.9298, + "mean_token_accuracy": 0.4247085750102997, + "num_tokens": 6626161327.0, + "step": 12961 + }, + { + "epoch": 3.5051379123850728, + "grad_norm": 3.21875, + "learning_rate": 0.005901098847885361, + "loss": 2.8798, + "mean_token_accuracy": 0.4289701581001282, + "num_tokens": 6626662120.0, + "step": 12962 + }, + { + "epoch": 3.505408328826393, + "grad_norm": 2.765625, + "learning_rate": 0.005899799849989411, + "loss": 2.891, + "mean_token_accuracy": 0.4466423988342285, + "num_tokens": 6627133041.0, + "step": 12963 + }, + { + "epoch": 3.505678745267712, + "grad_norm": 2.703125, + "learning_rate": 0.0058985010085826056, + "loss": 2.8951, + "mean_token_accuracy": 0.4260416626930237, + "num_tokens": 6627657249.0, + "step": 12964 + }, + { + "epoch": 3.505949161709032, + "grad_norm": 3.765625, + "learning_rate": 0.005897202323704789, + "loss": 2.6155, + "mean_token_accuracy": 0.4613950550556183, + "num_tokens": 6628139269.0, + "step": 12965 + }, + { + "epoch": 3.5062195781503513, + "grad_norm": 2.578125, + "learning_rate": 0.005895903795395809, + "loss": 2.7437, + "mean_token_accuracy": 0.45664092898368835, + "num_tokens": 6628630684.0, + "step": 12966 + }, + { + "epoch": 3.5064899945916714, + "grad_norm": 2.609375, + "learning_rate": 0.005894605423695516, + "loss": 2.7231, + "mean_token_accuracy": 0.4342789351940155, + "num_tokens": 6629154802.0, + "step": 12967 + }, + { + "epoch": 3.5067604110329906, + "grad_norm": 2.703125, + "learning_rate": 0.005893307208643742, + "loss": 2.8505, + "mean_token_accuracy": 0.4272316098213196, + "num_tokens": 6629678955.0, + "step": 12968 + }, + { + "epoch": 3.5070308274743103, + "grad_norm": 2.796875, + "learning_rate": 0.005892009150280316, + "loss": 3.0218, + "mean_token_accuracy": 0.4223043620586395, + "num_tokens": 6630203102.0, + "step": 12969 + }, + { + "epoch": 3.50730124391563, + "grad_norm": 2.8125, + "learning_rate": 0.005890711248645076, + "loss": 3.01, + "mean_token_accuracy": 0.4155966341495514, + "num_tokens": 6630727366.0, + "step": 12970 + }, + { + "epoch": 3.5075716603569496, + "grad_norm": 15.1875, + "learning_rate": 0.0058894135037778385, + "loss": 9.3768, + "mean_token_accuracy": 0.027560226619243622, + "num_tokens": 6631192001.0, + "step": 12971 + }, + { + "epoch": 3.507842076798269, + "grad_norm": 6.15625, + "learning_rate": 0.005888115915718421, + "loss": 3.0913, + "mean_token_accuracy": 0.40448087453842163, + "num_tokens": 6631667299.0, + "step": 12972 + }, + { + "epoch": 3.508112493239589, + "grad_norm": 1.9453125, + "learning_rate": 0.005886818484506644, + "loss": 2.7631, + "mean_token_accuracy": 0.42614057660102844, + "num_tokens": 6632191371.0, + "step": 12973 + }, + { + "epoch": 3.5083829096809085, + "grad_norm": 2.578125, + "learning_rate": 0.005885521210182309, + "loss": 2.9829, + "mean_token_accuracy": 0.40162986516952515, + "num_tokens": 6632715631.0, + "step": 12974 + }, + { + "epoch": 3.508653326122228, + "grad_norm": 4.09375, + "learning_rate": 0.005884224092785224, + "loss": 3.1483, + "mean_token_accuracy": 0.4116195738315582, + "num_tokens": 6633239854.0, + "step": 12975 + }, + { + "epoch": 3.5089237425635478, + "grad_norm": 3.578125, + "learning_rate": 0.005882927132355193, + "loss": 3.079, + "mean_token_accuracy": 0.41333532333374023, + "num_tokens": 6633753293.0, + "step": 12976 + }, + { + "epoch": 3.5091941590048674, + "grad_norm": 2.8125, + "learning_rate": 0.005881630328932004, + "loss": 2.8068, + "mean_token_accuracy": 0.4346763491630554, + "num_tokens": 6634277527.0, + "step": 12977 + }, + { + "epoch": 3.509464575446187, + "grad_norm": 2.9375, + "learning_rate": 0.005880333682555448, + "loss": 2.9593, + "mean_token_accuracy": 0.41303375363349915, + "num_tokens": 6634801798.0, + "step": 12978 + }, + { + "epoch": 3.5097349918875067, + "grad_norm": 3.296875, + "learning_rate": 0.005879037193265311, + "loss": 3.046, + "mean_token_accuracy": 0.4276495575904846, + "num_tokens": 6635238939.0, + "step": 12979 + }, + { + "epoch": 3.5100054083288263, + "grad_norm": 3.328125, + "learning_rate": 0.00587774086110137, + "loss": 3.0566, + "mean_token_accuracy": 0.4103599488735199, + "num_tokens": 6635763205.0, + "step": 12980 + }, + { + "epoch": 3.510275824770146, + "grad_norm": 3.46875, + "learning_rate": 0.005876444686103407, + "loss": 2.802, + "mean_token_accuracy": 0.4287704825401306, + "num_tokens": 6636287336.0, + "step": 12981 + }, + { + "epoch": 3.5105462412114656, + "grad_norm": 3.125, + "learning_rate": 0.0058751486683111896, + "loss": 2.8454, + "mean_token_accuracy": 0.4752872586250305, + "num_tokens": 6636733668.0, + "step": 12982 + }, + { + "epoch": 3.5108166576527853, + "grad_norm": 2.609375, + "learning_rate": 0.005873852807764477, + "loss": 3.0785, + "mean_token_accuracy": 0.4132503271102905, + "num_tokens": 6637210995.0, + "step": 12983 + }, + { + "epoch": 3.511087074094105, + "grad_norm": 3.546875, + "learning_rate": 0.00587255710450304, + "loss": 3.0889, + "mean_token_accuracy": 0.4289543032646179, + "num_tokens": 6637673947.0, + "step": 12984 + }, + { + "epoch": 3.5113574905354246, + "grad_norm": 2.734375, + "learning_rate": 0.005871261558566627, + "loss": 2.9996, + "mean_token_accuracy": 0.3828495740890503, + "num_tokens": 6638198190.0, + "step": 12985 + }, + { + "epoch": 3.511627906976744, + "grad_norm": 2.5, + "learning_rate": 0.005869966169994993, + "loss": 2.8583, + "mean_token_accuracy": 0.4385169446468353, + "num_tokens": 6638722463.0, + "step": 12986 + }, + { + "epoch": 3.511898323418064, + "grad_norm": 3.21875, + "learning_rate": 0.005868670938827884, + "loss": 2.8667, + "mean_token_accuracy": 0.44811195135116577, + "num_tokens": 6639208254.0, + "step": 12987 + }, + { + "epoch": 3.5121687398593835, + "grad_norm": 2.765625, + "learning_rate": 0.005867375865105038, + "loss": 2.8631, + "mean_token_accuracy": 0.42890024185180664, + "num_tokens": 6639732524.0, + "step": 12988 + }, + { + "epoch": 3.512439156300703, + "grad_norm": 3.515625, + "learning_rate": 0.005866080948866197, + "loss": 2.6554, + "mean_token_accuracy": 0.43452930450439453, + "num_tokens": 6640256703.0, + "step": 12989 + }, + { + "epoch": 3.512709572742023, + "grad_norm": 3.09375, + "learning_rate": 0.005864786190151088, + "loss": 2.9837, + "mean_token_accuracy": 0.42886221408843994, + "num_tokens": 6640780843.0, + "step": 12990 + }, + { + "epoch": 3.5129799891833424, + "grad_norm": 39.0, + "learning_rate": 0.005863491588999443, + "loss": 10.5596, + "mean_token_accuracy": 0.023410018533468246, + "num_tokens": 6641305119.0, + "step": 12991 + }, + { + "epoch": 3.513250405624662, + "grad_norm": 7.8125, + "learning_rate": 0.005862197145450979, + "loss": 3.223, + "mean_token_accuracy": 0.3973386883735657, + "num_tokens": 6641829321.0, + "step": 12992 + }, + { + "epoch": 3.5135208220659817, + "grad_norm": 2.453125, + "learning_rate": 0.0058609028595454195, + "loss": 2.9552, + "mean_token_accuracy": 0.43520301580429077, + "num_tokens": 6642340482.0, + "step": 12993 + }, + { + "epoch": 3.5137912385073014, + "grad_norm": 4.0625, + "learning_rate": 0.005859608731322472, + "loss": 3.1849, + "mean_token_accuracy": 0.41224807500839233, + "num_tokens": 6642864735.0, + "step": 12994 + }, + { + "epoch": 3.514061654948621, + "grad_norm": 3.875, + "learning_rate": 0.005858314760821844, + "loss": 2.8654, + "mean_token_accuracy": 0.4102547764778137, + "num_tokens": 6643354208.0, + "step": 12995 + }, + { + "epoch": 3.5143320713899406, + "grad_norm": 2.296875, + "learning_rate": 0.005857020948083241, + "loss": 2.9472, + "mean_token_accuracy": 0.4179636240005493, + "num_tokens": 6643878366.0, + "step": 12996 + }, + { + "epoch": 3.5146024878312603, + "grad_norm": 2.421875, + "learning_rate": 0.005855727293146362, + "loss": 2.9634, + "mean_token_accuracy": 0.42382413148880005, + "num_tokens": 6644402547.0, + "step": 12997 + }, + { + "epoch": 3.51487290427258, + "grad_norm": 2.609375, + "learning_rate": 0.005854433796050893, + "loss": 2.9393, + "mean_token_accuracy": 0.41139256954193115, + "num_tokens": 6644926741.0, + "step": 12998 + }, + { + "epoch": 3.5151433207138996, + "grad_norm": 3.078125, + "learning_rate": 0.005853140456836532, + "loss": 2.8562, + "mean_token_accuracy": 0.44746553897857666, + "num_tokens": 6645414301.0, + "step": 12999 + }, + { + "epoch": 3.515413737155219, + "grad_norm": 2.828125, + "learning_rate": 0.005851847275542957, + "loss": 2.6816, + "mean_token_accuracy": 0.4276629090309143, + "num_tokens": 6645938472.0, + "step": 13000 + }, + { + "epoch": 3.5156841535965384, + "grad_norm": 2.296875, + "learning_rate": 0.005850554252209843, + "loss": 2.7789, + "mean_token_accuracy": 0.43719831109046936, + "num_tokens": 6646462667.0, + "step": 13001 + }, + { + "epoch": 3.5159545700378585, + "grad_norm": 3.53125, + "learning_rate": 0.005849261386876873, + "loss": 2.9751, + "mean_token_accuracy": 0.42711687088012695, + "num_tokens": 6646986746.0, + "step": 13002 + }, + { + "epoch": 3.5162249864791777, + "grad_norm": 2.84375, + "learning_rate": 0.005847968679583707, + "loss": 3.0682, + "mean_token_accuracy": 0.39153048396110535, + "num_tokens": 6647511028.0, + "step": 13003 + }, + { + "epoch": 3.516495402920498, + "grad_norm": 3.03125, + "learning_rate": 0.005846676130370014, + "loss": 3.098, + "mean_token_accuracy": 0.4249606430530548, + "num_tokens": 6647983682.0, + "step": 13004 + }, + { + "epoch": 3.516765819361817, + "grad_norm": 2.734375, + "learning_rate": 0.005845383739275454, + "loss": 2.8615, + "mean_token_accuracy": 0.4220098853111267, + "num_tokens": 6648507950.0, + "step": 13005 + }, + { + "epoch": 3.517036235803137, + "grad_norm": 2.78125, + "learning_rate": 0.005844091506339677, + "loss": 2.8595, + "mean_token_accuracy": 0.4234807789325714, + "num_tokens": 6649032073.0, + "step": 13006 + }, + { + "epoch": 3.5173066522444563, + "grad_norm": 3.234375, + "learning_rate": 0.005842799431602337, + "loss": 2.9739, + "mean_token_accuracy": 0.422037273645401, + "num_tokens": 6649556219.0, + "step": 13007 + }, + { + "epoch": 3.5175770686857764, + "grad_norm": 3.234375, + "learning_rate": 0.005841507515103079, + "loss": 2.9692, + "mean_token_accuracy": 0.41437503695487976, + "num_tokens": 6650080361.0, + "step": 13008 + }, + { + "epoch": 3.5178474851270956, + "grad_norm": 3.453125, + "learning_rate": 0.005840215756881538, + "loss": 2.7935, + "mean_token_accuracy": 0.4573206901550293, + "num_tokens": 6650604564.0, + "step": 13009 + }, + { + "epoch": 3.518117901568415, + "grad_norm": 2.640625, + "learning_rate": 0.005838924156977352, + "loss": 2.8301, + "mean_token_accuracy": 0.447118878364563, + "num_tokens": 6651068972.0, + "step": 13010 + }, + { + "epoch": 3.518388318009735, + "grad_norm": 72.5, + "learning_rate": 0.0058376327154301515, + "loss": 12.6207, + "mean_token_accuracy": 6.618583029194269e-06, + "num_tokens": 6651593071.0, + "step": 13011 + }, + { + "epoch": 3.5186587344510545, + "grad_norm": 9.125, + "learning_rate": 0.005836341432279559, + "loss": 3.0613, + "mean_token_accuracy": 0.40699875354766846, + "num_tokens": 6652070120.0, + "step": 13012 + }, + { + "epoch": 3.518929150892374, + "grad_norm": 2.375, + "learning_rate": 0.005835050307565197, + "loss": 2.8916, + "mean_token_accuracy": 0.4330359101295471, + "num_tokens": 6652594347.0, + "step": 13013 + }, + { + "epoch": 3.5191995673336938, + "grad_norm": 2.203125, + "learning_rate": 0.005833759341326683, + "loss": 2.7741, + "mean_token_accuracy": 0.43497234582901, + "num_tokens": 6653118535.0, + "step": 13014 + }, + { + "epoch": 3.5194699837750134, + "grad_norm": 2.046875, + "learning_rate": 0.0058324685336036235, + "loss": 2.8225, + "mean_token_accuracy": 0.45805951952934265, + "num_tokens": 6653532419.0, + "step": 13015 + }, + { + "epoch": 3.519740400216333, + "grad_norm": 2.9375, + "learning_rate": 0.005831177884435625, + "loss": 3.0493, + "mean_token_accuracy": 0.41189801692962646, + "num_tokens": 6654056697.0, + "step": 13016 + }, + { + "epoch": 3.5200108166576527, + "grad_norm": 3.15625, + "learning_rate": 0.00582988739386229, + "loss": 2.9035, + "mean_token_accuracy": 0.4187227189540863, + "num_tokens": 6654580939.0, + "step": 13017 + }, + { + "epoch": 3.5202812330989723, + "grad_norm": 3.28125, + "learning_rate": 0.0058285970619232136, + "loss": 2.8199, + "mean_token_accuracy": 0.4650489091873169, + "num_tokens": 6655105209.0, + "step": 13018 + }, + { + "epoch": 3.520551649540292, + "grad_norm": 2.75, + "learning_rate": 0.005827306888657988, + "loss": 2.7, + "mean_token_accuracy": 0.444579541683197, + "num_tokens": 6655629288.0, + "step": 13019 + }, + { + "epoch": 3.5208220659816116, + "grad_norm": 3.15625, + "learning_rate": 0.005826016874106201, + "loss": 2.8808, + "mean_token_accuracy": 0.42972493171691895, + "num_tokens": 6656153552.0, + "step": 13020 + }, + { + "epoch": 3.5210924824229313, + "grad_norm": 3.234375, + "learning_rate": 0.005824727018307429, + "loss": 2.9758, + "mean_token_accuracy": 0.4182193875312805, + "num_tokens": 6656657038.0, + "step": 13021 + }, + { + "epoch": 3.521362898864251, + "grad_norm": 3.3125, + "learning_rate": 0.005823437321301255, + "loss": 2.9797, + "mean_token_accuracy": 0.44311583042144775, + "num_tokens": 6657117148.0, + "step": 13022 + }, + { + "epoch": 3.5216333153055706, + "grad_norm": 2.625, + "learning_rate": 0.005822147783127248, + "loss": 2.6323, + "mean_token_accuracy": 0.42385029792785645, + "num_tokens": 6657641301.0, + "step": 13023 + }, + { + "epoch": 3.52190373174689, + "grad_norm": 2.625, + "learning_rate": 0.0058208584038249665, + "loss": 2.8236, + "mean_token_accuracy": 0.4386795163154602, + "num_tokens": 6658165484.0, + "step": 13024 + }, + { + "epoch": 3.52217414818821, + "grad_norm": 3.34375, + "learning_rate": 0.005819569183433988, + "loss": 2.8614, + "mean_token_accuracy": 0.4284084141254425, + "num_tokens": 6658689662.0, + "step": 13025 + }, + { + "epoch": 3.5224445646295295, + "grad_norm": 3.6875, + "learning_rate": 0.0058182801219938554, + "loss": 3.0057, + "mean_token_accuracy": 0.42683538794517517, + "num_tokens": 6659213853.0, + "step": 13026 + }, + { + "epoch": 3.522714981070849, + "grad_norm": 3.921875, + "learning_rate": 0.0058169912195441325, + "loss": 2.8174, + "mean_token_accuracy": 0.43034952878952026, + "num_tokens": 6659733672.0, + "step": 13027 + }, + { + "epoch": 3.5229853975121688, + "grad_norm": 2.953125, + "learning_rate": 0.00581570247612436, + "loss": 2.8589, + "mean_token_accuracy": 0.4359481930732727, + "num_tokens": 6660239357.0, + "step": 13028 + }, + { + "epoch": 3.5232558139534884, + "grad_norm": 3.40625, + "learning_rate": 0.00581441389177408, + "loss": 3.1202, + "mean_token_accuracy": 0.4119013249874115, + "num_tokens": 6660763511.0, + "step": 13029 + }, + { + "epoch": 3.523526230394808, + "grad_norm": 2.96875, + "learning_rate": 0.005813125466532831, + "loss": 3.0586, + "mean_token_accuracy": 0.40872490406036377, + "num_tokens": 6661287794.0, + "step": 13030 + }, + { + "epoch": 3.5237966468361277, + "grad_norm": 101.0, + "learning_rate": 0.0058118372004401465, + "loss": 11.2747, + "mean_token_accuracy": 0.022432517260313034, + "num_tokens": 6661811993.0, + "step": 13031 + }, + { + "epoch": 3.5240670632774473, + "grad_norm": 5.8125, + "learning_rate": 0.005810549093535557, + "loss": 3.0434, + "mean_token_accuracy": 0.40661120414733887, + "num_tokens": 6662336178.0, + "step": 13032 + }, + { + "epoch": 3.524337479718767, + "grad_norm": 2.25, + "learning_rate": 0.005809261145858578, + "loss": 2.8863, + "mean_token_accuracy": 0.4145905375480652, + "num_tokens": 6662860351.0, + "step": 13033 + }, + { + "epoch": 3.5246078961600866, + "grad_norm": 2.875, + "learning_rate": 0.005807973357448736, + "loss": 2.9207, + "mean_token_accuracy": 0.41956761479377747, + "num_tokens": 6663384531.0, + "step": 13034 + }, + { + "epoch": 3.5248783126014063, + "grad_norm": 3.09375, + "learning_rate": 0.0058066857283455365, + "loss": 2.9282, + "mean_token_accuracy": 0.4354363679885864, + "num_tokens": 6663878964.0, + "step": 13035 + }, + { + "epoch": 3.525148729042726, + "grad_norm": 3.703125, + "learning_rate": 0.005805398258588494, + "loss": 3.0955, + "mean_token_accuracy": 0.4056604504585266, + "num_tokens": 6664403106.0, + "step": 13036 + }, + { + "epoch": 3.5254191454840456, + "grad_norm": 2.65625, + "learning_rate": 0.005804110948217112, + "loss": 2.7251, + "mean_token_accuracy": 0.4243099093437195, + "num_tokens": 6664927271.0, + "step": 13037 + }, + { + "epoch": 3.525689561925365, + "grad_norm": 3.015625, + "learning_rate": 0.005802823797270881, + "loss": 2.8112, + "mean_token_accuracy": 0.4329347610473633, + "num_tokens": 6665451430.0, + "step": 13038 + }, + { + "epoch": 3.525959978366685, + "grad_norm": 6.46875, + "learning_rate": 0.005801536805789303, + "loss": 2.5383, + "mean_token_accuracy": 0.4573129713535309, + "num_tokens": 6665975570.0, + "step": 13039 + }, + { + "epoch": 3.5262303948080045, + "grad_norm": 2.328125, + "learning_rate": 0.005800249973811864, + "loss": 3.025, + "mean_token_accuracy": 0.43293535709381104, + "num_tokens": 6666460486.0, + "step": 13040 + }, + { + "epoch": 3.526500811249324, + "grad_norm": 3.546875, + "learning_rate": 0.005798963301378043, + "loss": 2.8551, + "mean_token_accuracy": 0.44302380084991455, + "num_tokens": 6666984637.0, + "step": 13041 + }, + { + "epoch": 3.5267712276906433, + "grad_norm": 2.921875, + "learning_rate": 0.005797676788527328, + "loss": 2.9421, + "mean_token_accuracy": 0.4050544202327728, + "num_tokens": 6667508746.0, + "step": 13042 + }, + { + "epoch": 3.5270416441319634, + "grad_norm": 3.15625, + "learning_rate": 0.005796390435299187, + "loss": 2.8766, + "mean_token_accuracy": 0.4491717219352722, + "num_tokens": 6668032956.0, + "step": 13043 + }, + { + "epoch": 3.5273120605732826, + "grad_norm": 3.53125, + "learning_rate": 0.005795104241733086, + "loss": 2.9319, + "mean_token_accuracy": 0.414900541305542, + "num_tokens": 6668556840.0, + "step": 13044 + }, + { + "epoch": 3.5275824770146027, + "grad_norm": 3.328125, + "learning_rate": 0.005793818207868495, + "loss": 2.9829, + "mean_token_accuracy": 0.4133960008621216, + "num_tokens": 6669081095.0, + "step": 13045 + }, + { + "epoch": 3.527852893455922, + "grad_norm": 2.953125, + "learning_rate": 0.005792532333744875, + "loss": 2.8801, + "mean_token_accuracy": 0.4297561049461365, + "num_tokens": 6669605342.0, + "step": 13046 + }, + { + "epoch": 3.528123309897242, + "grad_norm": 3.015625, + "learning_rate": 0.005791246619401675, + "loss": 2.9819, + "mean_token_accuracy": 0.4144601821899414, + "num_tokens": 6670129591.0, + "step": 13047 + }, + { + "epoch": 3.528393726338561, + "grad_norm": 2.921875, + "learning_rate": 0.005789961064878347, + "loss": 2.7106, + "mean_token_accuracy": 0.4372791051864624, + "num_tokens": 6670605745.0, + "step": 13048 + }, + { + "epoch": 3.5286641427798813, + "grad_norm": 2.75, + "learning_rate": 0.005788675670214334, + "loss": 3.0351, + "mean_token_accuracy": 0.4427158832550049, + "num_tokens": 6671076367.0, + "step": 13049 + }, + { + "epoch": 3.5289345592212005, + "grad_norm": 3.828125, + "learning_rate": 0.00578739043544908, + "loss": 2.8366, + "mean_token_accuracy": 0.4240655303001404, + "num_tokens": 6671600615.0, + "step": 13050 + }, + { + "epoch": 3.52920497566252, + "grad_norm": 12.75, + "learning_rate": 0.005786105360622017, + "loss": 8.5172, + "mean_token_accuracy": 0.01583358272910118, + "num_tokens": 6672124819.0, + "step": 13051 + }, + { + "epoch": 3.5294753921038398, + "grad_norm": 5.59375, + "learning_rate": 0.0057848204457725694, + "loss": 3.0277, + "mean_token_accuracy": 0.41750872135162354, + "num_tokens": 6672649023.0, + "step": 13052 + }, + { + "epoch": 3.5297458085451594, + "grad_norm": 2.734375, + "learning_rate": 0.005783535690940173, + "loss": 2.7654, + "mean_token_accuracy": 0.4548885226249695, + "num_tokens": 6673173299.0, + "step": 13053 + }, + { + "epoch": 3.530016224986479, + "grad_norm": 3.03125, + "learning_rate": 0.005782251096164242, + "loss": 2.8783, + "mean_token_accuracy": 0.40611785650253296, + "num_tokens": 6673697515.0, + "step": 13054 + }, + { + "epoch": 3.5302866414277987, + "grad_norm": 2.984375, + "learning_rate": 0.005780966661484187, + "loss": 2.9108, + "mean_token_accuracy": 0.4082198441028595, + "num_tokens": 6674221731.0, + "step": 13055 + }, + { + "epoch": 3.5305570578691183, + "grad_norm": 2.609375, + "learning_rate": 0.005779682386939428, + "loss": 2.8238, + "mean_token_accuracy": 0.4210793673992157, + "num_tokens": 6674745779.0, + "step": 13056 + }, + { + "epoch": 3.530827474310438, + "grad_norm": 3.21875, + "learning_rate": 0.005778398272569363, + "loss": 2.7439, + "mean_token_accuracy": 0.47385793924331665, + "num_tokens": 6675269856.0, + "step": 13057 + }, + { + "epoch": 3.5310978907517576, + "grad_norm": 2.328125, + "learning_rate": 0.005777114318413394, + "loss": 2.6478, + "mean_token_accuracy": 0.43655914068222046, + "num_tokens": 6675794114.0, + "step": 13058 + }, + { + "epoch": 3.5313683071930773, + "grad_norm": 25.375, + "learning_rate": 0.005775830524510919, + "loss": 2.8029, + "mean_token_accuracy": 0.41629576683044434, + "num_tokens": 6676318372.0, + "step": 13059 + }, + { + "epoch": 3.531638723634397, + "grad_norm": 4.1875, + "learning_rate": 0.005774546890901324, + "loss": 2.981, + "mean_token_accuracy": 0.4089616537094116, + "num_tokens": 6676817819.0, + "step": 13060 + }, + { + "epoch": 3.5319091400757165, + "grad_norm": 3.671875, + "learning_rate": 0.005773263417623998, + "loss": 2.5714, + "mean_token_accuracy": 0.46820396184921265, + "num_tokens": 6677342054.0, + "step": 13061 + }, + { + "epoch": 3.532179556517036, + "grad_norm": 2.1875, + "learning_rate": 0.005771980104718323, + "loss": 2.8438, + "mean_token_accuracy": 0.43311476707458496, + "num_tokens": 6677866304.0, + "step": 13062 + }, + { + "epoch": 3.532449972958356, + "grad_norm": 2.328125, + "learning_rate": 0.005770696952223675, + "loss": 2.9164, + "mean_token_accuracy": 0.4305613338947296, + "num_tokens": 6678390484.0, + "step": 13063 + }, + { + "epoch": 3.5327203893996755, + "grad_norm": 3.78125, + "learning_rate": 0.005769413960179417, + "loss": 2.9967, + "mean_token_accuracy": 0.40593773126602173, + "num_tokens": 6678914669.0, + "step": 13064 + }, + { + "epoch": 3.532990805840995, + "grad_norm": 2.59375, + "learning_rate": 0.005768131128624925, + "loss": 2.9203, + "mean_token_accuracy": 0.40900635719299316, + "num_tokens": 6679438935.0, + "step": 13065 + }, + { + "epoch": 3.5332612222823148, + "grad_norm": 3.375, + "learning_rate": 0.005766848457599557, + "loss": 3.0031, + "mean_token_accuracy": 0.4026566445827484, + "num_tokens": 6679963033.0, + "step": 13066 + }, + { + "epoch": 3.5335316387236344, + "grad_norm": 2.796875, + "learning_rate": 0.005765565947142663, + "loss": 3.1092, + "mean_token_accuracy": 0.4147099256515503, + "num_tokens": 6680487210.0, + "step": 13067 + }, + { + "epoch": 3.533802055164954, + "grad_norm": 3.796875, + "learning_rate": 0.005764283597293605, + "loss": 2.951, + "mean_token_accuracy": 0.44345855712890625, + "num_tokens": 6680895639.0, + "step": 13068 + }, + { + "epoch": 3.5340724716062737, + "grad_norm": 2.671875, + "learning_rate": 0.005763001408091723, + "loss": 2.71, + "mean_token_accuracy": 0.43953022360801697, + "num_tokens": 6681381564.0, + "step": 13069 + }, + { + "epoch": 3.5343428880475933, + "grad_norm": 2.53125, + "learning_rate": 0.005761719379576353, + "loss": 2.9344, + "mean_token_accuracy": 0.422218918800354, + "num_tokens": 6681854880.0, + "step": 13070 + }, + { + "epoch": 3.534613304488913, + "grad_norm": 51.75, + "learning_rate": 0.005760437511786842, + "loss": 10.0518, + "mean_token_accuracy": 0.02483813650906086, + "num_tokens": 6682339218.0, + "step": 13071 + }, + { + "epoch": 3.5348837209302326, + "grad_norm": 6.15625, + "learning_rate": 0.005759155804762515, + "loss": 2.9558, + "mean_token_accuracy": 0.40643078088760376, + "num_tokens": 6682863311.0, + "step": 13072 + }, + { + "epoch": 3.5351541373715523, + "grad_norm": 2.28125, + "learning_rate": 0.005757874258542702, + "loss": 3.1225, + "mean_token_accuracy": 0.4009120464324951, + "num_tokens": 6683387398.0, + "step": 13073 + }, + { + "epoch": 3.535424553812872, + "grad_norm": 2.5, + "learning_rate": 0.005756592873166719, + "loss": 2.8006, + "mean_token_accuracy": 0.4341018795967102, + "num_tokens": 6683911543.0, + "step": 13074 + }, + { + "epoch": 3.5356949702541915, + "grad_norm": 2.78125, + "learning_rate": 0.005755311648673886, + "loss": 2.9535, + "mean_token_accuracy": 0.4432660937309265, + "num_tokens": 6684403210.0, + "step": 13075 + }, + { + "epoch": 3.535965386695511, + "grad_norm": 2.421875, + "learning_rate": 0.0057540305851035214, + "loss": 2.9094, + "mean_token_accuracy": 0.42296111583709717, + "num_tokens": 6684927404.0, + "step": 13076 + }, + { + "epoch": 3.536235803136831, + "grad_norm": 2.59375, + "learning_rate": 0.005752749682494924, + "loss": 2.7776, + "mean_token_accuracy": 0.4334526062011719, + "num_tokens": 6685451611.0, + "step": 13077 + }, + { + "epoch": 3.5365062195781505, + "grad_norm": 27.125, + "learning_rate": 0.005751468940887394, + "loss": 3.0607, + "mean_token_accuracy": 0.44002649188041687, + "num_tokens": 6685937285.0, + "step": 13078 + }, + { + "epoch": 3.53677663601947, + "grad_norm": 5.4375, + "learning_rate": 0.005750188360320235, + "loss": 3.1135, + "mean_token_accuracy": 0.41226276755332947, + "num_tokens": 6686461469.0, + "step": 13079 + }, + { + "epoch": 3.5370470524607898, + "grad_norm": 2.6875, + "learning_rate": 0.005748907940832736, + "loss": 3.0005, + "mean_token_accuracy": 0.420071005821228, + "num_tokens": 6686942609.0, + "step": 13080 + }, + { + "epoch": 3.5373174689021094, + "grad_norm": 3.375, + "learning_rate": 0.005747627682464181, + "loss": 2.8562, + "mean_token_accuracy": 0.4188612997531891, + "num_tokens": 6687466748.0, + "step": 13081 + }, + { + "epoch": 3.537587885343429, + "grad_norm": 2.96875, + "learning_rate": 0.005746347585253858, + "loss": 3.016, + "mean_token_accuracy": 0.44221031665802, + "num_tokens": 6687990962.0, + "step": 13082 + }, + { + "epoch": 3.5378583017847482, + "grad_norm": 3.234375, + "learning_rate": 0.0057450676492410415, + "loss": 2.994, + "mean_token_accuracy": 0.4072135090827942, + "num_tokens": 6688515177.0, + "step": 13083 + }, + { + "epoch": 3.5381287182260683, + "grad_norm": 2.890625, + "learning_rate": 0.005743787874464999, + "loss": 2.7277, + "mean_token_accuracy": 0.4414765238761902, + "num_tokens": 6689008120.0, + "step": 13084 + }, + { + "epoch": 3.5383991346673875, + "grad_norm": 3.515625, + "learning_rate": 0.005742508260965006, + "loss": 2.7338, + "mean_token_accuracy": 0.4289920926094055, + "num_tokens": 6689532294.0, + "step": 13085 + }, + { + "epoch": 3.5386695511087076, + "grad_norm": 2.9375, + "learning_rate": 0.005741228808780321, + "loss": 3.1051, + "mean_token_accuracy": 0.40445464849472046, + "num_tokens": 6690056473.0, + "step": 13086 + }, + { + "epoch": 3.538939967550027, + "grad_norm": 2.703125, + "learning_rate": 0.005739949517950197, + "loss": 2.7602, + "mean_token_accuracy": 0.42611682415008545, + "num_tokens": 6690580574.0, + "step": 13087 + }, + { + "epoch": 3.539210383991347, + "grad_norm": 2.359375, + "learning_rate": 0.005738670388513892, + "loss": 2.898, + "mean_token_accuracy": 0.4394652843475342, + "num_tokens": 6691104707.0, + "step": 13088 + }, + { + "epoch": 3.539480800432666, + "grad_norm": 2.734375, + "learning_rate": 0.005737391420510654, + "loss": 3.0072, + "mean_token_accuracy": 0.40843436121940613, + "num_tokens": 6691628986.0, + "step": 13089 + }, + { + "epoch": 3.539751216873986, + "grad_norm": 2.40625, + "learning_rate": 0.00573611261397972, + "loss": 3.1429, + "mean_token_accuracy": 0.4215927720069885, + "num_tokens": 6692106719.0, + "step": 13090 + }, + { + "epoch": 3.5400216333153054, + "grad_norm": 18.0, + "learning_rate": 0.0057348339689603355, + "loss": 9.7994, + "mean_token_accuracy": 0.03555741161108017, + "num_tokens": 6692561376.0, + "step": 13091 + }, + { + "epoch": 3.540292049756625, + "grad_norm": 5.3125, + "learning_rate": 0.005733555485491728, + "loss": 3.0587, + "mean_token_accuracy": 0.41895365715026855, + "num_tokens": 6693085579.0, + "step": 13092 + }, + { + "epoch": 3.5405624661979447, + "grad_norm": 2.15625, + "learning_rate": 0.005732277163613123, + "loss": 3.1226, + "mean_token_accuracy": 0.40889933705329895, + "num_tokens": 6693609807.0, + "step": 13093 + }, + { + "epoch": 3.5408328826392643, + "grad_norm": 3.484375, + "learning_rate": 0.005730999003363748, + "loss": 3.0456, + "mean_token_accuracy": 0.41099947690963745, + "num_tokens": 6694106328.0, + "step": 13094 + }, + { + "epoch": 3.541103299080584, + "grad_norm": 4.6875, + "learning_rate": 0.0057297210047828175, + "loss": 2.7351, + "mean_token_accuracy": 0.4537106156349182, + "num_tokens": 6694630581.0, + "step": 13095 + }, + { + "epoch": 3.5413737155219036, + "grad_norm": 2.484375, + "learning_rate": 0.005728443167909547, + "loss": 3.0149, + "mean_token_accuracy": 0.42097893357276917, + "num_tokens": 6695154623.0, + "step": 13096 + }, + { + "epoch": 3.5416441319632233, + "grad_norm": 3.703125, + "learning_rate": 0.005727165492783144, + "loss": 3.0282, + "mean_token_accuracy": 0.409422367811203, + "num_tokens": 6695678841.0, + "step": 13097 + }, + { + "epoch": 3.541914548404543, + "grad_norm": 2.921875, + "learning_rate": 0.005725887979442806, + "loss": 2.643, + "mean_token_accuracy": 0.4440164268016815, + "num_tokens": 6696202945.0, + "step": 13098 + }, + { + "epoch": 3.5421849648458625, + "grad_norm": 2.8125, + "learning_rate": 0.00572461062792774, + "loss": 2.8505, + "mean_token_accuracy": 0.4458138048648834, + "num_tokens": 6696727170.0, + "step": 13099 + }, + { + "epoch": 3.542455381287182, + "grad_norm": 2.625, + "learning_rate": 0.005723333438277132, + "loss": 2.8359, + "mean_token_accuracy": 0.4360998868942261, + "num_tokens": 6697251442.0, + "step": 13100 + }, + { + "epoch": 3.542725797728502, + "grad_norm": 2.90625, + "learning_rate": 0.0057220564105301686, + "loss": 2.8804, + "mean_token_accuracy": 0.4170280694961548, + "num_tokens": 6697775514.0, + "step": 13101 + }, + { + "epoch": 3.5429962141698215, + "grad_norm": 2.265625, + "learning_rate": 0.005720779544726039, + "loss": 2.9887, + "mean_token_accuracy": 0.4227466285228729, + "num_tokens": 6698299641.0, + "step": 13102 + }, + { + "epoch": 3.543266630611141, + "grad_norm": 2.78125, + "learning_rate": 0.0057195028409039155, + "loss": 2.8744, + "mean_token_accuracy": 0.4603028893470764, + "num_tokens": 6698764357.0, + "step": 13103 + }, + { + "epoch": 3.5435370470524608, + "grad_norm": 3.125, + "learning_rate": 0.005718226299102973, + "loss": 3.0467, + "mean_token_accuracy": 0.39196494221687317, + "num_tokens": 6699288462.0, + "step": 13104 + }, + { + "epoch": 3.5438074634937804, + "grad_norm": 2.296875, + "learning_rate": 0.005716949919362384, + "loss": 2.7974, + "mean_token_accuracy": 0.44709911942481995, + "num_tokens": 6699812741.0, + "step": 13105 + }, + { + "epoch": 3.5440778799351, + "grad_norm": 2.875, + "learning_rate": 0.005715673701721309, + "loss": 2.8206, + "mean_token_accuracy": 0.44370388984680176, + "num_tokens": 6700282187.0, + "step": 13106 + }, + { + "epoch": 3.5443482963764197, + "grad_norm": 2.875, + "learning_rate": 0.005714397646218901, + "loss": 2.94, + "mean_token_accuracy": 0.4377177357673645, + "num_tokens": 6700806404.0, + "step": 13107 + }, + { + "epoch": 3.5446187128177393, + "grad_norm": 3.671875, + "learning_rate": 0.005713121752894319, + "loss": 2.8612, + "mean_token_accuracy": 0.4294443726539612, + "num_tokens": 6701305555.0, + "step": 13108 + }, + { + "epoch": 3.544889129259059, + "grad_norm": 3.34375, + "learning_rate": 0.005711846021786711, + "loss": 2.9354, + "mean_token_accuracy": 0.42610085010528564, + "num_tokens": 6701829765.0, + "step": 13109 + }, + { + "epoch": 3.5451595457003786, + "grad_norm": 3.140625, + "learning_rate": 0.005710570452935215, + "loss": 2.8452, + "mean_token_accuracy": 0.43831878900527954, + "num_tokens": 6702286151.0, + "step": 13110 + }, + { + "epoch": 3.5454299621416983, + "grad_norm": 23.0, + "learning_rate": 0.005709295046378975, + "loss": 10.7128, + "mean_token_accuracy": 0.007034034002572298, + "num_tokens": 6702810393.0, + "step": 13111 + }, + { + "epoch": 3.545700378583018, + "grad_norm": 5.53125, + "learning_rate": 0.005708019802157124, + "loss": 3.0586, + "mean_token_accuracy": 0.41447433829307556, + "num_tokens": 6703334621.0, + "step": 13112 + }, + { + "epoch": 3.5459707950243375, + "grad_norm": 2.078125, + "learning_rate": 0.005706744720308783, + "loss": 2.8749, + "mean_token_accuracy": 0.43825653195381165, + "num_tokens": 6703858849.0, + "step": 13113 + }, + { + "epoch": 3.546241211465657, + "grad_norm": 2.765625, + "learning_rate": 0.005705469800873084, + "loss": 2.8984, + "mean_token_accuracy": 0.4405079483985901, + "num_tokens": 6704383017.0, + "step": 13114 + }, + { + "epoch": 3.546511627906977, + "grad_norm": 3.296875, + "learning_rate": 0.005704195043889138, + "loss": 2.9787, + "mean_token_accuracy": 0.4015476107597351, + "num_tokens": 6704907272.0, + "step": 13115 + }, + { + "epoch": 3.5467820443482965, + "grad_norm": 2.671875, + "learning_rate": 0.0057029204493960674, + "loss": 2.8903, + "mean_token_accuracy": 0.41178733110427856, + "num_tokens": 6705431494.0, + "step": 13116 + }, + { + "epoch": 3.547052460789616, + "grad_norm": 3.84375, + "learning_rate": 0.005701646017432971, + "loss": 3.0779, + "mean_token_accuracy": 0.41579121351242065, + "num_tokens": 6705939116.0, + "step": 13117 + }, + { + "epoch": 3.5473228772309358, + "grad_norm": 10.5625, + "learning_rate": 0.005700371748038955, + "loss": 2.9831, + "mean_token_accuracy": 0.4201694428920746, + "num_tokens": 6706463268.0, + "step": 13118 + }, + { + "epoch": 3.5475932936722554, + "grad_norm": 2.28125, + "learning_rate": 0.005699097641253124, + "loss": 2.7787, + "mean_token_accuracy": 0.43803656101226807, + "num_tokens": 6706948707.0, + "step": 13119 + }, + { + "epoch": 3.547863710113575, + "grad_norm": 3.171875, + "learning_rate": 0.005697823697114563, + "loss": 2.8675, + "mean_token_accuracy": 0.4308614730834961, + "num_tokens": 6707472983.0, + "step": 13120 + }, + { + "epoch": 3.5481341265548947, + "grad_norm": 3.375, + "learning_rate": 0.005696549915662363, + "loss": 2.766, + "mean_token_accuracy": 0.4303645193576813, + "num_tokens": 6707997108.0, + "step": 13121 + }, + { + "epoch": 3.5484045429962143, + "grad_norm": 2.671875, + "learning_rate": 0.005695276296935612, + "loss": 2.8059, + "mean_token_accuracy": 0.4286888837814331, + "num_tokens": 6708521171.0, + "step": 13122 + }, + { + "epoch": 3.548674959437534, + "grad_norm": 2.875, + "learning_rate": 0.005694002840973384, + "loss": 2.9918, + "mean_token_accuracy": 0.4217061400413513, + "num_tokens": 6709045389.0, + "step": 13123 + }, + { + "epoch": 3.548945375878853, + "grad_norm": 2.703125, + "learning_rate": 0.005692729547814748, + "loss": 2.686, + "mean_token_accuracy": 0.44112908840179443, + "num_tokens": 6709544361.0, + "step": 13124 + }, + { + "epoch": 3.5492157923201733, + "grad_norm": 10.9375, + "learning_rate": 0.0056914564174987805, + "loss": 2.9169, + "mean_token_accuracy": 0.42949649691581726, + "num_tokens": 6710068452.0, + "step": 13125 + }, + { + "epoch": 3.5494862087614925, + "grad_norm": 3.09375, + "learning_rate": 0.0056901834500645435, + "loss": 3.0671, + "mean_token_accuracy": 0.4172537326812744, + "num_tokens": 6710592605.0, + "step": 13126 + }, + { + "epoch": 3.5497566252028125, + "grad_norm": 2.46875, + "learning_rate": 0.005688910645551088, + "loss": 3.1139, + "mean_token_accuracy": 0.4065406322479248, + "num_tokens": 6711116870.0, + "step": 13127 + }, + { + "epoch": 3.5500270416441317, + "grad_norm": 6.875, + "learning_rate": 0.005687638003997477, + "loss": 2.821, + "mean_token_accuracy": 0.4552774727344513, + "num_tokens": 6711641035.0, + "step": 13128 + }, + { + "epoch": 3.550297458085452, + "grad_norm": 2.265625, + "learning_rate": 0.005686365525442754, + "loss": 2.9562, + "mean_token_accuracy": 0.4106239974498749, + "num_tokens": 6712165181.0, + "step": 13129 + }, + { + "epoch": 3.550567874526771, + "grad_norm": 2.984375, + "learning_rate": 0.00568509320992596, + "loss": 2.8918, + "mean_token_accuracy": 0.4242292046546936, + "num_tokens": 6712689351.0, + "step": 13130 + }, + { + "epoch": 3.550838290968091, + "grad_norm": 8.8125, + "learning_rate": 0.005683821057486139, + "loss": 8.8379, + "mean_token_accuracy": 0.03319239988923073, + "num_tokens": 6713213541.0, + "step": 13131 + }, + { + "epoch": 3.5511087074094103, + "grad_norm": 6.0625, + "learning_rate": 0.0056825490681623216, + "loss": 3.0952, + "mean_token_accuracy": 0.42139798402786255, + "num_tokens": 6713689574.0, + "step": 13132 + }, + { + "epoch": 3.5513791238507304, + "grad_norm": 2.90625, + "learning_rate": 0.005681277241993534, + "loss": 2.8253, + "mean_token_accuracy": 0.4088556468486786, + "num_tokens": 6714213839.0, + "step": 13133 + }, + { + "epoch": 3.5516495402920496, + "grad_norm": 3.390625, + "learning_rate": 0.005680005579018805, + "loss": 2.9912, + "mean_token_accuracy": 0.4286586046218872, + "num_tokens": 6714738026.0, + "step": 13134 + }, + { + "epoch": 3.5519199567333692, + "grad_norm": 3.34375, + "learning_rate": 0.005678734079277152, + "loss": 3.0891, + "mean_token_accuracy": 0.41139891743659973, + "num_tokens": 6715251842.0, + "step": 13135 + }, + { + "epoch": 3.552190373174689, + "grad_norm": 2.75, + "learning_rate": 0.005677462742807582, + "loss": 2.9801, + "mean_token_accuracy": 0.43668803572654724, + "num_tokens": 6715712487.0, + "step": 13136 + }, + { + "epoch": 3.5524607896160085, + "grad_norm": 3.078125, + "learning_rate": 0.00567619156964911, + "loss": 2.7548, + "mean_token_accuracy": 0.4239121675491333, + "num_tokens": 6716205232.0, + "step": 13137 + }, + { + "epoch": 3.552731206057328, + "grad_norm": 2.71875, + "learning_rate": 0.005674920559840737, + "loss": 3.0111, + "mean_token_accuracy": 0.40019017457962036, + "num_tokens": 6716729451.0, + "step": 13138 + }, + { + "epoch": 3.553001622498648, + "grad_norm": 2.421875, + "learning_rate": 0.005673649713421458, + "loss": 2.6542, + "mean_token_accuracy": 0.4228931665420532, + "num_tokens": 6717253681.0, + "step": 13139 + }, + { + "epoch": 3.5532720389399675, + "grad_norm": 2.875, + "learning_rate": 0.005672379030430274, + "loss": 2.9344, + "mean_token_accuracy": 0.41924184560775757, + "num_tokens": 6717777960.0, + "step": 13140 + }, + { + "epoch": 3.553542455381287, + "grad_norm": 3.46875, + "learning_rate": 0.005671108510906165, + "loss": 2.9091, + "mean_token_accuracy": 0.4194348454475403, + "num_tokens": 6718302171.0, + "step": 13141 + }, + { + "epoch": 3.5538128718226067, + "grad_norm": 2.453125, + "learning_rate": 0.0056698381548881205, + "loss": 2.8853, + "mean_token_accuracy": 0.4224761724472046, + "num_tokens": 6718826435.0, + "step": 13142 + }, + { + "epoch": 3.5540832882639264, + "grad_norm": 3.296875, + "learning_rate": 0.005668567962415117, + "loss": 2.936, + "mean_token_accuracy": 0.40606120228767395, + "num_tokens": 6719350493.0, + "step": 13143 + }, + { + "epoch": 3.554353704705246, + "grad_norm": 3.0, + "learning_rate": 0.005667297933526125, + "loss": 2.9458, + "mean_token_accuracy": 0.4343680739402771, + "num_tokens": 6719874684.0, + "step": 13144 + }, + { + "epoch": 3.5546241211465657, + "grad_norm": 3.046875, + "learning_rate": 0.005666028068260114, + "loss": 2.8474, + "mean_token_accuracy": 0.4250418543815613, + "num_tokens": 6720341688.0, + "step": 13145 + }, + { + "epoch": 3.5548945375878853, + "grad_norm": 2.53125, + "learning_rate": 0.00566475836665605, + "loss": 2.6321, + "mean_token_accuracy": 0.46360573172569275, + "num_tokens": 6720860299.0, + "step": 13146 + }, + { + "epoch": 3.555164954029205, + "grad_norm": 2.5, + "learning_rate": 0.005663488828752887, + "loss": 2.9399, + "mean_token_accuracy": 0.43061476945877075, + "num_tokens": 6721330868.0, + "step": 13147 + }, + { + "epoch": 3.5554353704705246, + "grad_norm": 2.359375, + "learning_rate": 0.005662219454589584, + "loss": 2.8644, + "mean_token_accuracy": 0.4474291503429413, + "num_tokens": 6721832037.0, + "step": 13148 + }, + { + "epoch": 3.5557057869118442, + "grad_norm": 3.125, + "learning_rate": 0.0056609502442050855, + "loss": 3.0902, + "mean_token_accuracy": 0.3925178647041321, + "num_tokens": 6722356205.0, + "step": 13149 + }, + { + "epoch": 3.555976203353164, + "grad_norm": 2.78125, + "learning_rate": 0.005659681197638331, + "loss": 3.0182, + "mean_token_accuracy": 0.4222612679004669, + "num_tokens": 6722815693.0, + "step": 13150 + }, + { + "epoch": 3.5562466197944835, + "grad_norm": 15.6875, + "learning_rate": 0.005658412314928265, + "loss": 9.2963, + "mean_token_accuracy": 0.004259855952113867, + "num_tokens": 6723302014.0, + "step": 13151 + }, + { + "epoch": 3.556517036235803, + "grad_norm": 14.375, + "learning_rate": 0.0056571435961138205, + "loss": 3.1962, + "mean_token_accuracy": 0.38477498292922974, + "num_tokens": 6723826075.0, + "step": 13152 + }, + { + "epoch": 3.556787452677123, + "grad_norm": 3.4375, + "learning_rate": 0.005655875041233916, + "loss": 3.0075, + "mean_token_accuracy": 0.42097413539886475, + "num_tokens": 6724350347.0, + "step": 13153 + }, + { + "epoch": 3.5570578691184425, + "grad_norm": 2.609375, + "learning_rate": 0.005654606650327486, + "loss": 2.9625, + "mean_token_accuracy": 0.40669673681259155, + "num_tokens": 6724854171.0, + "step": 13154 + }, + { + "epoch": 3.557328285559762, + "grad_norm": 3.140625, + "learning_rate": 0.005653338423433444, + "loss": 2.8862, + "mean_token_accuracy": 0.4029477834701538, + "num_tokens": 6725378393.0, + "step": 13155 + }, + { + "epoch": 3.5575987020010817, + "grad_norm": 3.640625, + "learning_rate": 0.0056520703605907, + "loss": 2.7097, + "mean_token_accuracy": 0.4229433536529541, + "num_tokens": 6725855486.0, + "step": 13156 + }, + { + "epoch": 3.5578691184424014, + "grad_norm": 3.015625, + "learning_rate": 0.005650802461838166, + "loss": 3.0172, + "mean_token_accuracy": 0.4253556728363037, + "num_tokens": 6726323204.0, + "step": 13157 + }, + { + "epoch": 3.558139534883721, + "grad_norm": 3.53125, + "learning_rate": 0.00564953472721474, + "loss": 2.9215, + "mean_token_accuracy": 0.42785966396331787, + "num_tokens": 6726787810.0, + "step": 13158 + }, + { + "epoch": 3.5584099513250407, + "grad_norm": 3.140625, + "learning_rate": 0.005648267156759329, + "loss": 2.8767, + "mean_token_accuracy": 0.4297674298286438, + "num_tokens": 6727312066.0, + "step": 13159 + }, + { + "epoch": 3.5586803677663603, + "grad_norm": 3.140625, + "learning_rate": 0.005646999750510815, + "loss": 2.8601, + "mean_token_accuracy": 0.4344920217990875, + "num_tokens": 6727792849.0, + "step": 13160 + }, + { + "epoch": 3.55895078420768, + "grad_norm": 3.453125, + "learning_rate": 0.005645732508508095, + "loss": 3.0361, + "mean_token_accuracy": 0.4132933020591736, + "num_tokens": 6728316939.0, + "step": 13161 + }, + { + "epoch": 3.5592212006489996, + "grad_norm": 3.078125, + "learning_rate": 0.005644465430790045, + "loss": 3.0035, + "mean_token_accuracy": 0.43459588289260864, + "num_tokens": 6728759344.0, + "step": 13162 + }, + { + "epoch": 3.5594916170903192, + "grad_norm": 3.0, + "learning_rate": 0.005643198517395546, + "loss": 2.7835, + "mean_token_accuracy": 0.436079204082489, + "num_tokens": 6729254642.0, + "step": 13163 + }, + { + "epoch": 3.559762033531639, + "grad_norm": 3.265625, + "learning_rate": 0.0056419317683634685, + "loss": 3.101, + "mean_token_accuracy": 0.4046209454536438, + "num_tokens": 6729778917.0, + "step": 13164 + }, + { + "epoch": 3.560032449972958, + "grad_norm": 3.765625, + "learning_rate": 0.005640665183732683, + "loss": 2.9872, + "mean_token_accuracy": 0.4253542423248291, + "num_tokens": 6730303108.0, + "step": 13165 + }, + { + "epoch": 3.560302866414278, + "grad_norm": 3.359375, + "learning_rate": 0.005639398763542052, + "loss": 3.0407, + "mean_token_accuracy": 0.39883869886398315, + "num_tokens": 6730827387.0, + "step": 13166 + }, + { + "epoch": 3.5605732828555974, + "grad_norm": 3.0, + "learning_rate": 0.005638132507830427, + "loss": 3.0029, + "mean_token_accuracy": 0.40576955676078796, + "num_tokens": 6731351519.0, + "step": 13167 + }, + { + "epoch": 3.5608436992969175, + "grad_norm": 2.890625, + "learning_rate": 0.005636866416636669, + "loss": 2.8934, + "mean_token_accuracy": 0.41790440678596497, + "num_tokens": 6731875595.0, + "step": 13168 + }, + { + "epoch": 3.5611141157382367, + "grad_norm": 2.875, + "learning_rate": 0.005635600489999622, + "loss": 2.8785, + "mean_token_accuracy": 0.4284261465072632, + "num_tokens": 6732369542.0, + "step": 13169 + }, + { + "epoch": 3.5613845321795568, + "grad_norm": 3.390625, + "learning_rate": 0.005634334727958122, + "loss": 2.7391, + "mean_token_accuracy": 0.4313161373138428, + "num_tokens": 6732870966.0, + "step": 13170 + }, + { + "epoch": 3.561654948620876, + "grad_norm": 15.4375, + "learning_rate": 0.005633069130551016, + "loss": 9.6491, + "mean_token_accuracy": 0.002467217855155468, + "num_tokens": 6733395232.0, + "step": 13171 + }, + { + "epoch": 3.561925365062196, + "grad_norm": 6.8125, + "learning_rate": 0.0056318036978171325, + "loss": 3.1597, + "mean_token_accuracy": 0.400384783744812, + "num_tokens": 6733904287.0, + "step": 13172 + }, + { + "epoch": 3.5621957815035152, + "grad_norm": 2.46875, + "learning_rate": 0.005630538429795297, + "loss": 2.9146, + "mean_token_accuracy": 0.39844810962677, + "num_tokens": 6734428460.0, + "step": 13173 + }, + { + "epoch": 3.5624661979448353, + "grad_norm": 2.578125, + "learning_rate": 0.005629273326524332, + "loss": 2.7653, + "mean_token_accuracy": 0.4410897195339203, + "num_tokens": 6734952736.0, + "step": 13174 + }, + { + "epoch": 3.5627366143861545, + "grad_norm": 2.796875, + "learning_rate": 0.005628008388043058, + "loss": 2.7762, + "mean_token_accuracy": 0.43165016174316406, + "num_tokens": 6735476993.0, + "step": 13175 + }, + { + "epoch": 3.563007030827474, + "grad_norm": 4.09375, + "learning_rate": 0.005626743614390282, + "loss": 2.8019, + "mean_token_accuracy": 0.4769827127456665, + "num_tokens": 6735936848.0, + "step": 13176 + }, + { + "epoch": 3.563277447268794, + "grad_norm": 2.328125, + "learning_rate": 0.005625479005604815, + "loss": 2.9063, + "mean_token_accuracy": 0.41395264863967896, + "num_tokens": 6736461090.0, + "step": 13177 + }, + { + "epoch": 3.5635478637101135, + "grad_norm": 2.71875, + "learning_rate": 0.00562421456172546, + "loss": 3.0058, + "mean_token_accuracy": 0.43706944584846497, + "num_tokens": 6736924891.0, + "step": 13178 + }, + { + "epoch": 3.563818280151433, + "grad_norm": 2.78125, + "learning_rate": 0.005622950282791007, + "loss": 2.912, + "mean_token_accuracy": 0.41387486457824707, + "num_tokens": 6737449124.0, + "step": 13179 + }, + { + "epoch": 3.5640886965927527, + "grad_norm": 3.234375, + "learning_rate": 0.005621686168840255, + "loss": 3.1474, + "mean_token_accuracy": 0.4044222831726074, + "num_tokens": 6737973409.0, + "step": 13180 + }, + { + "epoch": 3.5643591130340724, + "grad_norm": 3.125, + "learning_rate": 0.00562042221991199, + "loss": 2.8569, + "mean_token_accuracy": 0.44657862186431885, + "num_tokens": 6738497544.0, + "step": 13181 + }, + { + "epoch": 3.564629529475392, + "grad_norm": 3.140625, + "learning_rate": 0.0056191584360449865, + "loss": 3.0146, + "mean_token_accuracy": 0.4289293885231018, + "num_tokens": 6739021688.0, + "step": 13182 + }, + { + "epoch": 3.5648999459167117, + "grad_norm": 3.25, + "learning_rate": 0.005617894817278028, + "loss": 2.8851, + "mean_token_accuracy": 0.3987487256526947, + "num_tokens": 6739545863.0, + "step": 13183 + }, + { + "epoch": 3.5651703623580313, + "grad_norm": 2.46875, + "learning_rate": 0.0056166313636498875, + "loss": 2.7563, + "mean_token_accuracy": 0.44386371970176697, + "num_tokens": 6740070092.0, + "step": 13184 + }, + { + "epoch": 3.565440778799351, + "grad_norm": 3.03125, + "learning_rate": 0.005615368075199325, + "loss": 2.8169, + "mean_token_accuracy": 0.45484626293182373, + "num_tokens": 6740534165.0, + "step": 13185 + }, + { + "epoch": 3.5657111952406706, + "grad_norm": 2.5625, + "learning_rate": 0.0056141049519651065, + "loss": 2.8413, + "mean_token_accuracy": 0.43238407373428345, + "num_tokens": 6741058316.0, + "step": 13186 + }, + { + "epoch": 3.5659816116819902, + "grad_norm": 2.734375, + "learning_rate": 0.005612841993985984, + "loss": 2.8494, + "mean_token_accuracy": 0.398515522480011, + "num_tokens": 6741582500.0, + "step": 13187 + }, + { + "epoch": 3.56625202812331, + "grad_norm": 2.9375, + "learning_rate": 0.0056115792013007116, + "loss": 2.8897, + "mean_token_accuracy": 0.4339958727359772, + "num_tokens": 6742106668.0, + "step": 13188 + }, + { + "epoch": 3.5665224445646295, + "grad_norm": 3.515625, + "learning_rate": 0.005610316573948039, + "loss": 3.0218, + "mean_token_accuracy": 0.40647226572036743, + "num_tokens": 6742630876.0, + "step": 13189 + }, + { + "epoch": 3.566792861005949, + "grad_norm": 2.953125, + "learning_rate": 0.0056090541119667, + "loss": 2.8261, + "mean_token_accuracy": 0.4345560073852539, + "num_tokens": 6743143978.0, + "step": 13190 + }, + { + "epoch": 3.567063277447269, + "grad_norm": 14.9375, + "learning_rate": 0.0056077918153954395, + "loss": 9.1432, + "mean_token_accuracy": 0.038957882672548294, + "num_tokens": 6743643086.0, + "step": 13191 + }, + { + "epoch": 3.5673336938885885, + "grad_norm": 6.375, + "learning_rate": 0.005606529684272984, + "loss": 3.073, + "mean_token_accuracy": 0.4002302885055542, + "num_tokens": 6744167305.0, + "step": 13192 + }, + { + "epoch": 3.567604110329908, + "grad_norm": 2.28125, + "learning_rate": 0.005605267718638054, + "loss": 2.8613, + "mean_token_accuracy": 0.43473324179649353, + "num_tokens": 6744691584.0, + "step": 13193 + }, + { + "epoch": 3.5678745267712277, + "grad_norm": 3.21875, + "learning_rate": 0.00560400591852938, + "loss": 2.8484, + "mean_token_accuracy": 0.42558425664901733, + "num_tokens": 6745215763.0, + "step": 13194 + }, + { + "epoch": 3.5681449432125474, + "grad_norm": 4.03125, + "learning_rate": 0.00560274428398567, + "loss": 3.0476, + "mean_token_accuracy": 0.4441075325012207, + "num_tokens": 6745659734.0, + "step": 13195 + }, + { + "epoch": 3.568415359653867, + "grad_norm": 2.921875, + "learning_rate": 0.005601482815045637, + "loss": 2.8153, + "mean_token_accuracy": 0.44144541025161743, + "num_tokens": 6746183973.0, + "step": 13196 + }, + { + "epoch": 3.5686857760951867, + "grad_norm": 3.125, + "learning_rate": 0.005600221511747992, + "loss": 2.9768, + "mean_token_accuracy": 0.40351518988609314, + "num_tokens": 6746649875.0, + "step": 13197 + }, + { + "epoch": 3.5689561925365063, + "grad_norm": 2.203125, + "learning_rate": 0.005598960374131426, + "loss": 2.7217, + "mean_token_accuracy": 0.4209025502204895, + "num_tokens": 6747174128.0, + "step": 13198 + }, + { + "epoch": 3.569226608977826, + "grad_norm": 2.21875, + "learning_rate": 0.005597699402234639, + "loss": 2.9406, + "mean_token_accuracy": 0.4199974238872528, + "num_tokens": 6747681686.0, + "step": 13199 + }, + { + "epoch": 3.5694970254191456, + "grad_norm": 2.484375, + "learning_rate": 0.005596438596096325, + "loss": 3.066, + "mean_token_accuracy": 0.4234379529953003, + "num_tokens": 6748205673.0, + "step": 13200 + }, + { + "epoch": 3.5697674418604652, + "grad_norm": 3.25, + "learning_rate": 0.0055951779557551614, + "loss": 2.8552, + "mean_token_accuracy": 0.4268745183944702, + "num_tokens": 6748729808.0, + "step": 13201 + }, + { + "epoch": 3.570037858301785, + "grad_norm": 2.953125, + "learning_rate": 0.005593917481249836, + "loss": 2.8, + "mean_token_accuracy": 0.43574029207229614, + "num_tokens": 6749228134.0, + "step": 13202 + }, + { + "epoch": 3.5703082747431045, + "grad_norm": 2.75, + "learning_rate": 0.005592657172619018, + "loss": 2.8576, + "mean_token_accuracy": 0.44437944889068604, + "num_tokens": 6749678260.0, + "step": 13203 + }, + { + "epoch": 3.570578691184424, + "grad_norm": 2.953125, + "learning_rate": 0.005591397029901384, + "loss": 2.9221, + "mean_token_accuracy": 0.43566200137138367, + "num_tokens": 6750173285.0, + "step": 13204 + }, + { + "epoch": 3.570849107625744, + "grad_norm": 2.8125, + "learning_rate": 0.005590137053135589, + "loss": 2.8779, + "mean_token_accuracy": 0.4347372055053711, + "num_tokens": 6750697559.0, + "step": 13205 + }, + { + "epoch": 3.571119524067063, + "grad_norm": 3.265625, + "learning_rate": 0.005588877242360303, + "loss": 2.9453, + "mean_token_accuracy": 0.41919979453086853, + "num_tokens": 6751221797.0, + "step": 13206 + }, + { + "epoch": 3.571389940508383, + "grad_norm": 3.828125, + "learning_rate": 0.0055876175976141765, + "loss": 2.5922, + "mean_token_accuracy": 0.46915650367736816, + "num_tokens": 6751745978.0, + "step": 13207 + }, + { + "epoch": 3.5716603569497023, + "grad_norm": 2.71875, + "learning_rate": 0.0055863581189358556, + "loss": 2.7913, + "mean_token_accuracy": 0.4202027916908264, + "num_tokens": 6752270167.0, + "step": 13208 + }, + { + "epoch": 3.5719307733910224, + "grad_norm": 2.6875, + "learning_rate": 0.0055850988063639905, + "loss": 2.888, + "mean_token_accuracy": 0.43818873167037964, + "num_tokens": 6752777662.0, + "step": 13209 + }, + { + "epoch": 3.5722011898323416, + "grad_norm": 3.5, + "learning_rate": 0.005583839659937216, + "loss": 2.9001, + "mean_token_accuracy": 0.42598962783813477, + "num_tokens": 6753299592.0, + "step": 13210 + }, + { + "epoch": 3.5724716062736617, + "grad_norm": 10.625, + "learning_rate": 0.005582580679694172, + "loss": 9.1566, + "mean_token_accuracy": 0.013633526861667633, + "num_tokens": 6753823865.0, + "step": 13211 + }, + { + "epoch": 3.572742022714981, + "grad_norm": 8.5, + "learning_rate": 0.005581321865673484, + "loss": 3.0872, + "mean_token_accuracy": 0.38391897082328796, + "num_tokens": 6754348035.0, + "step": 13212 + }, + { + "epoch": 3.573012439156301, + "grad_norm": 3.234375, + "learning_rate": 0.005580063217913773, + "loss": 2.6194, + "mean_token_accuracy": 0.44759851694107056, + "num_tokens": 6754821019.0, + "step": 13213 + }, + { + "epoch": 3.57328285559762, + "grad_norm": 2.6875, + "learning_rate": 0.005578804736453664, + "loss": 2.9306, + "mean_token_accuracy": 0.4280150234699249, + "num_tokens": 6755345177.0, + "step": 13214 + }, + { + "epoch": 3.5735532720389402, + "grad_norm": 3.03125, + "learning_rate": 0.005577546421331767, + "loss": 2.8289, + "mean_token_accuracy": 0.42605215311050415, + "num_tokens": 6755869235.0, + "step": 13215 + }, + { + "epoch": 3.5738236884802594, + "grad_norm": 2.75, + "learning_rate": 0.005576288272586694, + "loss": 2.9449, + "mean_token_accuracy": 0.4194467067718506, + "num_tokens": 6756393332.0, + "step": 13216 + }, + { + "epoch": 3.574094104921579, + "grad_norm": 3.59375, + "learning_rate": 0.005575030290257044, + "loss": 3.0053, + "mean_token_accuracy": 0.4237444996833801, + "num_tokens": 6756917574.0, + "step": 13217 + }, + { + "epoch": 3.5743645213628987, + "grad_norm": 2.671875, + "learning_rate": 0.005573772474381421, + "loss": 2.8865, + "mean_token_accuracy": 0.4274606704711914, + "num_tokens": 6757441839.0, + "step": 13218 + }, + { + "epoch": 3.5746349378042184, + "grad_norm": 3.015625, + "learning_rate": 0.005572514824998414, + "loss": 2.7511, + "mean_token_accuracy": 0.43174871802330017, + "num_tokens": 6757966061.0, + "step": 13219 + }, + { + "epoch": 3.574905354245538, + "grad_norm": 2.390625, + "learning_rate": 0.005571257342146616, + "loss": 2.7813, + "mean_token_accuracy": 0.4613919258117676, + "num_tokens": 6758382786.0, + "step": 13220 + }, + { + "epoch": 3.5751757706868577, + "grad_norm": 2.71875, + "learning_rate": 0.005570000025864608, + "loss": 2.8571, + "mean_token_accuracy": 0.4410977363586426, + "num_tokens": 6758906980.0, + "step": 13221 + }, + { + "epoch": 3.5754461871281773, + "grad_norm": 4.5625, + "learning_rate": 0.005568742876190964, + "loss": 3.0164, + "mean_token_accuracy": 0.43716832995414734, + "num_tokens": 6759431232.0, + "step": 13222 + }, + { + "epoch": 3.575716603569497, + "grad_norm": 4.75, + "learning_rate": 0.005567485893164264, + "loss": 2.6801, + "mean_token_accuracy": 0.4671573042869568, + "num_tokens": 6759873377.0, + "step": 13223 + }, + { + "epoch": 3.5759870200108166, + "grad_norm": 2.28125, + "learning_rate": 0.005566229076823075, + "loss": 3.0051, + "mean_token_accuracy": 0.42774319648742676, + "num_tokens": 6760394997.0, + "step": 13224 + }, + { + "epoch": 3.5762574364521362, + "grad_norm": 4.28125, + "learning_rate": 0.005564972427205951, + "loss": 2.9665, + "mean_token_accuracy": 0.4252588450908661, + "num_tokens": 6760919167.0, + "step": 13225 + }, + { + "epoch": 3.576527852893456, + "grad_norm": 2.3125, + "learning_rate": 0.005563715944351462, + "loss": 2.8763, + "mean_token_accuracy": 0.44825899600982666, + "num_tokens": 6761387807.0, + "step": 13226 + }, + { + "epoch": 3.5767982693347755, + "grad_norm": 3.078125, + "learning_rate": 0.005562459628298154, + "loss": 2.727, + "mean_token_accuracy": 0.4558054804801941, + "num_tokens": 6761911843.0, + "step": 13227 + }, + { + "epoch": 3.577068685776095, + "grad_norm": 2.59375, + "learning_rate": 0.005561203479084571, + "loss": 2.978, + "mean_token_accuracy": 0.42645063996315, + "num_tokens": 6762435975.0, + "step": 13228 + }, + { + "epoch": 3.577339102217415, + "grad_norm": 3.15625, + "learning_rate": 0.005559947496749265, + "loss": 3.0647, + "mean_token_accuracy": 0.4061698913574219, + "num_tokens": 6762921572.0, + "step": 13229 + }, + { + "epoch": 3.5776095186587344, + "grad_norm": 2.71875, + "learning_rate": 0.005558691681330763, + "loss": 2.8359, + "mean_token_accuracy": 0.45775479078292847, + "num_tokens": 6763382287.0, + "step": 13230 + }, + { + "epoch": 3.577879935100054, + "grad_norm": 250.0, + "learning_rate": 0.0055574360328676025, + "loss": 15.6672, + "mean_token_accuracy": 0.0050728945061564445, + "num_tokens": 6763881152.0, + "step": 13231 + }, + { + "epoch": 3.5781503515413737, + "grad_norm": 6.34375, + "learning_rate": 0.005556180551398314, + "loss": 2.8934, + "mean_token_accuracy": 0.4094759523868561, + "num_tokens": 6764405407.0, + "step": 13232 + }, + { + "epoch": 3.5784207679826934, + "grad_norm": 1.9140625, + "learning_rate": 0.005554925236961412, + "loss": 2.9097, + "mean_token_accuracy": 0.431037575006485, + "num_tokens": 6764886155.0, + "step": 13233 + }, + { + "epoch": 3.578691184424013, + "grad_norm": 2.890625, + "learning_rate": 0.00555367008959542, + "loss": 2.9419, + "mean_token_accuracy": 0.4376416802406311, + "num_tokens": 6765373111.0, + "step": 13234 + }, + { + "epoch": 3.5789616008653327, + "grad_norm": 3.0625, + "learning_rate": 0.005552415109338845, + "loss": 3.0145, + "mean_token_accuracy": 0.40959227085113525, + "num_tokens": 6765897240.0, + "step": 13235 + }, + { + "epoch": 3.5792320173066523, + "grad_norm": 2.9375, + "learning_rate": 0.0055511602962301934, + "loss": 2.8532, + "mean_token_accuracy": 0.43034446239471436, + "num_tokens": 6766386662.0, + "step": 13236 + }, + { + "epoch": 3.579502433747972, + "grad_norm": 2.8125, + "learning_rate": 0.005549905650307971, + "loss": 2.9209, + "mean_token_accuracy": 0.427875280380249, + "num_tokens": 6766910747.0, + "step": 13237 + }, + { + "epoch": 3.5797728501892916, + "grad_norm": 2.65625, + "learning_rate": 0.005548651171610669, + "loss": 2.8938, + "mean_token_accuracy": 0.4101617932319641, + "num_tokens": 6767434975.0, + "step": 13238 + }, + { + "epoch": 3.5800432666306112, + "grad_norm": 3.0625, + "learning_rate": 0.005547396860176779, + "loss": 3.0284, + "mean_token_accuracy": 0.4249108135700226, + "num_tokens": 6767909045.0, + "step": 13239 + }, + { + "epoch": 3.580313683071931, + "grad_norm": 3.578125, + "learning_rate": 0.005546142716044791, + "loss": 3.0442, + "mean_token_accuracy": 0.42910635471343994, + "num_tokens": 6768372320.0, + "step": 13240 + }, + { + "epoch": 3.5805840995132505, + "grad_norm": 3.21875, + "learning_rate": 0.0055448887392531825, + "loss": 3.0073, + "mean_token_accuracy": 0.4322143793106079, + "num_tokens": 6768877426.0, + "step": 13241 + }, + { + "epoch": 3.58085451595457, + "grad_norm": 2.5625, + "learning_rate": 0.005543634929840427, + "loss": 2.7947, + "mean_token_accuracy": 0.428586483001709, + "num_tokens": 6769401553.0, + "step": 13242 + }, + { + "epoch": 3.58112493239589, + "grad_norm": 2.8125, + "learning_rate": 0.005542381287845002, + "loss": 3.089, + "mean_token_accuracy": 0.4153016209602356, + "num_tokens": 6769874068.0, + "step": 13243 + }, + { + "epoch": 3.5813953488372094, + "grad_norm": 3.359375, + "learning_rate": 0.005541127813305365, + "loss": 2.9677, + "mean_token_accuracy": 0.43572744727134705, + "num_tokens": 6770329061.0, + "step": 13244 + }, + { + "epoch": 3.581665765278529, + "grad_norm": 3.125, + "learning_rate": 0.005539874506259981, + "loss": 2.8959, + "mean_token_accuracy": 0.4209393560886383, + "num_tokens": 6770853295.0, + "step": 13245 + }, + { + "epoch": 3.5819361817198487, + "grad_norm": 3.28125, + "learning_rate": 0.005538621366747306, + "loss": 2.8546, + "mean_token_accuracy": 0.43521082401275635, + "num_tokens": 6771377476.0, + "step": 13246 + }, + { + "epoch": 3.582206598161168, + "grad_norm": 4.0, + "learning_rate": 0.0055373683948057905, + "loss": 2.7896, + "mean_token_accuracy": 0.44404739141464233, + "num_tokens": 6771870884.0, + "step": 13247 + }, + { + "epoch": 3.582477014602488, + "grad_norm": 3.828125, + "learning_rate": 0.005536115590473872, + "loss": 3.0455, + "mean_token_accuracy": 0.42791807651519775, + "num_tokens": 6772395158.0, + "step": 13248 + }, + { + "epoch": 3.582747431043807, + "grad_norm": 4.25, + "learning_rate": 0.005534862953790001, + "loss": 2.9915, + "mean_token_accuracy": 0.4318235516548157, + "num_tokens": 6772858783.0, + "step": 13249 + }, + { + "epoch": 3.5830178474851273, + "grad_norm": 3.015625, + "learning_rate": 0.005533610484792604, + "loss": 2.8638, + "mean_token_accuracy": 0.44403982162475586, + "num_tokens": 6773382980.0, + "step": 13250 + }, + { + "epoch": 3.5832882639264465, + "grad_norm": 61.5, + "learning_rate": 0.00553235818352011, + "loss": 9.791, + "mean_token_accuracy": 0.031436141580343246, + "num_tokens": 6773843183.0, + "step": 13251 + }, + { + "epoch": 3.5835586803677666, + "grad_norm": 6.65625, + "learning_rate": 0.00553110605001095, + "loss": 3.1266, + "mean_token_accuracy": 0.391824334859848, + "num_tokens": 6774367408.0, + "step": 13252 + }, + { + "epoch": 3.583829096809086, + "grad_norm": 2.15625, + "learning_rate": 0.0055298540843035406, + "loss": 2.8656, + "mean_token_accuracy": 0.4316767454147339, + "num_tokens": 6774853374.0, + "step": 13253 + }, + { + "epoch": 3.584099513250406, + "grad_norm": 3.140625, + "learning_rate": 0.005528602286436289, + "loss": 2.9498, + "mean_token_accuracy": 0.42774972319602966, + "num_tokens": 6775377653.0, + "step": 13254 + }, + { + "epoch": 3.584369929691725, + "grad_norm": 3.625, + "learning_rate": 0.005527350656447614, + "loss": 2.8271, + "mean_token_accuracy": 0.4169486165046692, + "num_tokens": 6775901797.0, + "step": 13255 + }, + { + "epoch": 3.584640346133045, + "grad_norm": 2.5, + "learning_rate": 0.005526099194375915, + "loss": 2.7722, + "mean_token_accuracy": 0.4435656666755676, + "num_tokens": 6776425747.0, + "step": 13256 + }, + { + "epoch": 3.5849107625743644, + "grad_norm": 3.34375, + "learning_rate": 0.005524847900259588, + "loss": 3.0153, + "mean_token_accuracy": 0.4153073728084564, + "num_tokens": 6776949874.0, + "step": 13257 + }, + { + "epoch": 3.585181179015684, + "grad_norm": 14.625, + "learning_rate": 0.0055235967741370275, + "loss": 2.6938, + "mean_token_accuracy": 0.4790923595428467, + "num_tokens": 6777474136.0, + "step": 13258 + }, + { + "epoch": 3.5854515954570036, + "grad_norm": 3.40625, + "learning_rate": 0.005522345816046627, + "loss": 2.855, + "mean_token_accuracy": 0.44809842109680176, + "num_tokens": 6777998356.0, + "step": 13259 + }, + { + "epoch": 3.5857220118983233, + "grad_norm": 2.703125, + "learning_rate": 0.005521095026026761, + "loss": 2.8412, + "mean_token_accuracy": 0.43040210008621216, + "num_tokens": 6778522637.0, + "step": 13260 + }, + { + "epoch": 3.585992428339643, + "grad_norm": 2.734375, + "learning_rate": 0.005519844404115816, + "loss": 2.7974, + "mean_token_accuracy": 0.4276481568813324, + "num_tokens": 6779046879.0, + "step": 13261 + }, + { + "epoch": 3.5862628447809626, + "grad_norm": 3.046875, + "learning_rate": 0.005518593950352158, + "loss": 2.7011, + "mean_token_accuracy": 0.4404798150062561, + "num_tokens": 6779571033.0, + "step": 13262 + }, + { + "epoch": 3.586533261222282, + "grad_norm": 3.09375, + "learning_rate": 0.005517343664774161, + "loss": 2.7748, + "mean_token_accuracy": 0.4368407428264618, + "num_tokens": 6780095305.0, + "step": 13263 + }, + { + "epoch": 3.586803677663602, + "grad_norm": 2.890625, + "learning_rate": 0.005516093547420183, + "loss": 2.848, + "mean_token_accuracy": 0.41843461990356445, + "num_tokens": 6780619518.0, + "step": 13264 + }, + { + "epoch": 3.5870740941049215, + "grad_norm": 2.953125, + "learning_rate": 0.005514843598328579, + "loss": 2.8922, + "mean_token_accuracy": 0.42561739683151245, + "num_tokens": 6781143792.0, + "step": 13265 + }, + { + "epoch": 3.587344510546241, + "grad_norm": 2.828125, + "learning_rate": 0.0055135938175377075, + "loss": 2.7562, + "mean_token_accuracy": 0.44406378269195557, + "num_tokens": 6781621668.0, + "step": 13266 + }, + { + "epoch": 3.587614926987561, + "grad_norm": 3.53125, + "learning_rate": 0.005512344205085911, + "loss": 2.9745, + "mean_token_accuracy": 0.4264923334121704, + "num_tokens": 6782145863.0, + "step": 13267 + }, + { + "epoch": 3.5878853434288804, + "grad_norm": 3.0625, + "learning_rate": 0.00551109476101153, + "loss": 3.1325, + "mean_token_accuracy": 0.39610564708709717, + "num_tokens": 6782670140.0, + "step": 13268 + }, + { + "epoch": 3.5881557598702, + "grad_norm": 2.75, + "learning_rate": 0.0055098454853529065, + "loss": 2.8492, + "mean_token_accuracy": 0.4264506697654724, + "num_tokens": 6783194370.0, + "step": 13269 + }, + { + "epoch": 3.5884261763115197, + "grad_norm": 2.75, + "learning_rate": 0.00550859637814837, + "loss": 2.976, + "mean_token_accuracy": 0.373128741979599, + "num_tokens": 6783718521.0, + "step": 13270 + }, + { + "epoch": 3.5886965927528394, + "grad_norm": 50.75, + "learning_rate": 0.005507347439436244, + "loss": 10.7916, + "mean_token_accuracy": 0.001581286545842886, + "num_tokens": 6784242685.0, + "step": 13271 + }, + { + "epoch": 3.588967009194159, + "grad_norm": 8.25, + "learning_rate": 0.005506098669254852, + "loss": 3.0538, + "mean_token_accuracy": 0.3933410942554474, + "num_tokens": 6784766901.0, + "step": 13272 + }, + { + "epoch": 3.5892374256354787, + "grad_norm": 2.390625, + "learning_rate": 0.0055048500676425065, + "loss": 2.8739, + "mean_token_accuracy": 0.4311297535896301, + "num_tokens": 6785290915.0, + "step": 13273 + }, + { + "epoch": 3.5895078420767983, + "grad_norm": 2.5625, + "learning_rate": 0.005503601634637522, + "loss": 2.7464, + "mean_token_accuracy": 0.445272296667099, + "num_tokens": 6785802978.0, + "step": 13274 + }, + { + "epoch": 3.589778258518118, + "grad_norm": 2.53125, + "learning_rate": 0.0055023533702782046, + "loss": 2.9516, + "mean_token_accuracy": 0.43392127752304077, + "num_tokens": 6786327258.0, + "step": 13275 + }, + { + "epoch": 3.5900486749594376, + "grad_norm": 3.328125, + "learning_rate": 0.005501105274602855, + "loss": 3.1507, + "mean_token_accuracy": 0.3836236894130707, + "num_tokens": 6786851375.0, + "step": 13276 + }, + { + "epoch": 3.5903190914007572, + "grad_norm": 3.046875, + "learning_rate": 0.0054998573476497635, + "loss": 2.7549, + "mean_token_accuracy": 0.4419924318790436, + "num_tokens": 6787316816.0, + "step": 13277 + }, + { + "epoch": 3.590589507842077, + "grad_norm": 3.265625, + "learning_rate": 0.005498609589457227, + "loss": 3.0811, + "mean_token_accuracy": 0.40813905000686646, + "num_tokens": 6787840987.0, + "step": 13278 + }, + { + "epoch": 3.5908599242833965, + "grad_norm": 4.09375, + "learning_rate": 0.005497362000063526, + "loss": 2.8658, + "mean_token_accuracy": 0.429097443819046, + "num_tokens": 6788365152.0, + "step": 13279 + }, + { + "epoch": 3.591130340724716, + "grad_norm": 2.671875, + "learning_rate": 0.005496114579506937, + "loss": 2.8272, + "mean_token_accuracy": 0.43873196840286255, + "num_tokens": 6788837288.0, + "step": 13280 + }, + { + "epoch": 3.591400757166036, + "grad_norm": 2.8125, + "learning_rate": 0.005494867327825744, + "loss": 3.0643, + "mean_token_accuracy": 0.418032169342041, + "num_tokens": 6789361401.0, + "step": 13281 + }, + { + "epoch": 3.5916711736073554, + "grad_norm": 8.5625, + "learning_rate": 0.005493620245058207, + "loss": 2.8992, + "mean_token_accuracy": 0.4555651545524597, + "num_tokens": 6789819196.0, + "step": 13282 + }, + { + "epoch": 3.591941590048675, + "grad_norm": 2.59375, + "learning_rate": 0.005492373331242595, + "loss": 2.9446, + "mean_token_accuracy": 0.42729926109313965, + "num_tokens": 6790342373.0, + "step": 13283 + }, + { + "epoch": 3.5922120064899947, + "grad_norm": 2.75, + "learning_rate": 0.005491126586417168, + "loss": 3.0808, + "mean_token_accuracy": 0.4126856029033661, + "num_tokens": 6790866648.0, + "step": 13284 + }, + { + "epoch": 3.5924824229313144, + "grad_norm": 3.03125, + "learning_rate": 0.005489880010620174, + "loss": 2.8393, + "mean_token_accuracy": 0.4326324760913849, + "num_tokens": 6791390912.0, + "step": 13285 + }, + { + "epoch": 3.592752839372634, + "grad_norm": 2.734375, + "learning_rate": 0.00548863360388987, + "loss": 2.7275, + "mean_token_accuracy": 0.42156535387039185, + "num_tokens": 6791915102.0, + "step": 13286 + }, + { + "epoch": 3.5930232558139537, + "grad_norm": 3.21875, + "learning_rate": 0.005487387366264492, + "loss": 2.9052, + "mean_token_accuracy": 0.4206153154373169, + "num_tokens": 6792439354.0, + "step": 13287 + }, + { + "epoch": 3.593293672255273, + "grad_norm": 3.28125, + "learning_rate": 0.005486141297782279, + "loss": 3.0634, + "mean_token_accuracy": 0.391568124294281, + "num_tokens": 6792963579.0, + "step": 13288 + }, + { + "epoch": 3.593564088696593, + "grad_norm": 3.25, + "learning_rate": 0.00548489539848147, + "loss": 2.9614, + "mean_token_accuracy": 0.42558175325393677, + "num_tokens": 6793487859.0, + "step": 13289 + }, + { + "epoch": 3.593834505137912, + "grad_norm": 3.375, + "learning_rate": 0.005483649668400289, + "loss": 2.8998, + "mean_token_accuracy": 0.4139452874660492, + "num_tokens": 6794012138.0, + "step": 13290 + }, + { + "epoch": 3.5941049215792322, + "grad_norm": 186.0, + "learning_rate": 0.005482404107576956, + "loss": 14.5898, + "mean_token_accuracy": 0.007601055316627026, + "num_tokens": 6794536341.0, + "step": 13291 + }, + { + "epoch": 3.5943753380205514, + "grad_norm": 5.125, + "learning_rate": 0.005481158716049694, + "loss": 3.0246, + "mean_token_accuracy": 0.4155161380767822, + "num_tokens": 6795024324.0, + "step": 13292 + }, + { + "epoch": 3.5946457544618715, + "grad_norm": 1.8671875, + "learning_rate": 0.005479913493856713, + "loss": 2.9992, + "mean_token_accuracy": 0.39391517639160156, + "num_tokens": 6795548598.0, + "step": 13293 + }, + { + "epoch": 3.5949161709031907, + "grad_norm": 2.96875, + "learning_rate": 0.005478668441036216, + "loss": 2.9671, + "mean_token_accuracy": 0.41698890924453735, + "num_tokens": 6796072877.0, + "step": 13294 + }, + { + "epoch": 3.595186587344511, + "grad_norm": 2.703125, + "learning_rate": 0.005477423557626413, + "loss": 2.9839, + "mean_token_accuracy": 0.4307825565338135, + "num_tokens": 6796597085.0, + "step": 13295 + }, + { + "epoch": 3.59545700378583, + "grad_norm": 4.78125, + "learning_rate": 0.005476178843665493, + "loss": 3.0391, + "mean_token_accuracy": 0.41966670751571655, + "num_tokens": 6797083742.0, + "step": 13296 + }, + { + "epoch": 3.59572742022715, + "grad_norm": 2.8125, + "learning_rate": 0.00547493429919165, + "loss": 2.9797, + "mean_token_accuracy": 0.40859365463256836, + "num_tokens": 6797607985.0, + "step": 13297 + }, + { + "epoch": 3.5959978366684693, + "grad_norm": 3.25, + "learning_rate": 0.005473689924243074, + "loss": 2.9307, + "mean_token_accuracy": 0.44051313400268555, + "num_tokens": 6798080065.0, + "step": 13298 + }, + { + "epoch": 3.596268253109789, + "grad_norm": 2.6875, + "learning_rate": 0.005472445718857941, + "loss": 2.983, + "mean_token_accuracy": 0.408836305141449, + "num_tokens": 6798604263.0, + "step": 13299 + }, + { + "epoch": 3.5965386695511086, + "grad_norm": 3.171875, + "learning_rate": 0.005471201683074427, + "loss": 2.9781, + "mean_token_accuracy": 0.4249599874019623, + "num_tokens": 6799128539.0, + "step": 13300 + }, + { + "epoch": 3.596809085992428, + "grad_norm": 3.0, + "learning_rate": 0.005469957816930704, + "loss": 3.0032, + "mean_token_accuracy": 0.42597347497940063, + "num_tokens": 6799622770.0, + "step": 13301 + }, + { + "epoch": 3.597079502433748, + "grad_norm": 3.0625, + "learning_rate": 0.005468714120464941, + "loss": 2.8876, + "mean_token_accuracy": 0.41230058670043945, + "num_tokens": 6800147046.0, + "step": 13302 + }, + { + "epoch": 3.5973499188750675, + "grad_norm": 3.296875, + "learning_rate": 0.005467470593715293, + "loss": 2.8809, + "mean_token_accuracy": 0.42936423420906067, + "num_tokens": 6800630041.0, + "step": 13303 + }, + { + "epoch": 3.597620335316387, + "grad_norm": 3.265625, + "learning_rate": 0.00546622723671992, + "loss": 3.0557, + "mean_token_accuracy": 0.4112910032272339, + "num_tokens": 6801154276.0, + "step": 13304 + }, + { + "epoch": 3.597890751757707, + "grad_norm": 2.59375, + "learning_rate": 0.005464984049516968, + "loss": 2.7627, + "mean_token_accuracy": 0.440065860748291, + "num_tokens": 6801678545.0, + "step": 13305 + }, + { + "epoch": 3.5981611681990264, + "grad_norm": 2.890625, + "learning_rate": 0.005463741032144582, + "loss": 2.8893, + "mean_token_accuracy": 0.41952943801879883, + "num_tokens": 6802202817.0, + "step": 13306 + }, + { + "epoch": 3.598431584640346, + "grad_norm": 3.328125, + "learning_rate": 0.005462498184640905, + "loss": 2.816, + "mean_token_accuracy": 0.4191683530807495, + "num_tokens": 6802726928.0, + "step": 13307 + }, + { + "epoch": 3.5987020010816657, + "grad_norm": 3.171875, + "learning_rate": 0.005461255507044063, + "loss": 2.7338, + "mean_token_accuracy": 0.4330594539642334, + "num_tokens": 6803251201.0, + "step": 13308 + }, + { + "epoch": 3.5989724175229854, + "grad_norm": 3.09375, + "learning_rate": 0.005460012999392197, + "loss": 2.7787, + "mean_token_accuracy": 0.4291120171546936, + "num_tokens": 6803775467.0, + "step": 13309 + }, + { + "epoch": 3.599242833964305, + "grad_norm": 2.84375, + "learning_rate": 0.00545877066172342, + "loss": 2.9339, + "mean_token_accuracy": 0.4368356168270111, + "num_tokens": 6804299687.0, + "step": 13310 + }, + { + "epoch": 3.5995132504056246, + "grad_norm": 14.0, + "learning_rate": 0.0054575284940758554, + "loss": 9.4659, + "mean_token_accuracy": 0.034398652613162994, + "num_tokens": 6804804841.0, + "step": 13311 + }, + { + "epoch": 3.5997836668469443, + "grad_norm": 7.0625, + "learning_rate": 0.005456286496487617, + "loss": 3.1284, + "mean_token_accuracy": 0.39058196544647217, + "num_tokens": 6805328957.0, + "step": 13312 + }, + { + "epoch": 3.600054083288264, + "grad_norm": 2.140625, + "learning_rate": 0.005455044668996813, + "loss": 2.8751, + "mean_token_accuracy": 0.43270307779312134, + "num_tokens": 6805853175.0, + "step": 13313 + }, + { + "epoch": 3.6003244997295836, + "grad_norm": 2.203125, + "learning_rate": 0.00545380301164154, + "loss": 2.9789, + "mean_token_accuracy": 0.4106373190879822, + "num_tokens": 6806377446.0, + "step": 13314 + }, + { + "epoch": 3.600594916170903, + "grad_norm": 27.875, + "learning_rate": 0.0054525615244599025, + "loss": 3.1921, + "mean_token_accuracy": 0.4068603813648224, + "num_tokens": 6806901558.0, + "step": 13315 + }, + { + "epoch": 3.600865332612223, + "grad_norm": 4.90625, + "learning_rate": 0.005451320207489995, + "loss": 3.1325, + "mean_token_accuracy": 0.3933415412902832, + "num_tokens": 6807425821.0, + "step": 13316 + }, + { + "epoch": 3.6011357490535425, + "grad_norm": 3.515625, + "learning_rate": 0.005450079060769898, + "loss": 3.0815, + "mean_token_accuracy": 0.41299551725387573, + "num_tokens": 6807950037.0, + "step": 13317 + }, + { + "epoch": 3.601406165494862, + "grad_norm": 3.5625, + "learning_rate": 0.005448838084337699, + "loss": 3.0753, + "mean_token_accuracy": 0.40293997526168823, + "num_tokens": 6808474214.0, + "step": 13318 + }, + { + "epoch": 3.601676581936182, + "grad_norm": 3.59375, + "learning_rate": 0.005447597278231473, + "loss": 2.9337, + "mean_token_accuracy": 0.42518025636672974, + "num_tokens": 6808998457.0, + "step": 13319 + }, + { + "epoch": 3.6019469983775014, + "grad_norm": 2.734375, + "learning_rate": 0.005446356642489288, + "loss": 2.9682, + "mean_token_accuracy": 0.43294471502304077, + "num_tokens": 6809499069.0, + "step": 13320 + }, + { + "epoch": 3.602217414818821, + "grad_norm": 3.1875, + "learning_rate": 0.005445116177149215, + "loss": 2.9303, + "mean_token_accuracy": 0.4229232668876648, + "num_tokens": 6810023271.0, + "step": 13321 + }, + { + "epoch": 3.6024878312601407, + "grad_norm": 3.109375, + "learning_rate": 0.005443875882249314, + "loss": 2.8536, + "mean_token_accuracy": 0.4039299488067627, + "num_tokens": 6810547418.0, + "step": 13322 + }, + { + "epoch": 3.6027582477014604, + "grad_norm": 2.8125, + "learning_rate": 0.005442635757827639, + "loss": 3.0364, + "mean_token_accuracy": 0.40292319655418396, + "num_tokens": 6811053024.0, + "step": 13323 + }, + { + "epoch": 3.60302866414278, + "grad_norm": 3.65625, + "learning_rate": 0.005441395803922245, + "loss": 3.0112, + "mean_token_accuracy": 0.42038875818252563, + "num_tokens": 6811577282.0, + "step": 13324 + }, + { + "epoch": 3.6032990805840996, + "grad_norm": 2.640625, + "learning_rate": 0.005440156020571173, + "loss": 2.9924, + "mean_token_accuracy": 0.40826815366744995, + "num_tokens": 6812101560.0, + "step": 13325 + }, + { + "epoch": 3.6035694970254193, + "grad_norm": 3.59375, + "learning_rate": 0.005438916407812462, + "loss": 2.9587, + "mean_token_accuracy": 0.42285341024398804, + "num_tokens": 6812563767.0, + "step": 13326 + }, + { + "epoch": 3.603839913466739, + "grad_norm": 35.0, + "learning_rate": 0.005437676965684153, + "loss": 2.7859, + "mean_token_accuracy": 0.4246810972690582, + "num_tokens": 6813087975.0, + "step": 13327 + }, + { + "epoch": 3.6041103299080586, + "grad_norm": 5.40625, + "learning_rate": 0.00543643769422427, + "loss": 2.986, + "mean_token_accuracy": 0.4062548875808716, + "num_tokens": 6813612004.0, + "step": 13328 + }, + { + "epoch": 3.6043807463493778, + "grad_norm": 2.453125, + "learning_rate": 0.005435198593470842, + "loss": 2.9226, + "mean_token_accuracy": 0.430903822183609, + "num_tokens": 6814136192.0, + "step": 13329 + }, + { + "epoch": 3.604651162790698, + "grad_norm": 12.0, + "learning_rate": 0.005433959663461884, + "loss": 2.829, + "mean_token_accuracy": 0.4375588893890381, + "num_tokens": 6814635226.0, + "step": 13330 + }, + { + "epoch": 3.604921579232017, + "grad_norm": 31.625, + "learning_rate": 0.005432720904235411, + "loss": 10.6016, + "mean_token_accuracy": 0.0031821546144783497, + "num_tokens": 6815159399.0, + "step": 13331 + }, + { + "epoch": 3.605191995673337, + "grad_norm": 5.15625, + "learning_rate": 0.005431482315829437, + "loss": 3.107, + "mean_token_accuracy": 0.4129217565059662, + "num_tokens": 6815625756.0, + "step": 13332 + }, + { + "epoch": 3.6054624121146563, + "grad_norm": 2.734375, + "learning_rate": 0.005430243898281961, + "loss": 2.9507, + "mean_token_accuracy": 0.4185561537742615, + "num_tokens": 6816149848.0, + "step": 13333 + }, + { + "epoch": 3.6057328285559764, + "grad_norm": 2.8125, + "learning_rate": 0.005429005651630979, + "loss": 2.7704, + "mean_token_accuracy": 0.4161369502544403, + "num_tokens": 6816674041.0, + "step": 13334 + }, + { + "epoch": 3.6060032449972956, + "grad_norm": 3.0, + "learning_rate": 0.005427767575914491, + "loss": 2.8554, + "mean_token_accuracy": 0.43177467584609985, + "num_tokens": 6817198325.0, + "step": 13335 + }, + { + "epoch": 3.6062736614386157, + "grad_norm": 3.234375, + "learning_rate": 0.005426529671170479, + "loss": 2.8717, + "mean_token_accuracy": 0.4139602482318878, + "num_tokens": 6817722543.0, + "step": 13336 + }, + { + "epoch": 3.606544077879935, + "grad_norm": 3.140625, + "learning_rate": 0.005425291937436925, + "loss": 2.921, + "mean_token_accuracy": 0.4181109368801117, + "num_tokens": 6818246780.0, + "step": 13337 + }, + { + "epoch": 3.606814494321255, + "grad_norm": 3.0625, + "learning_rate": 0.00542405437475181, + "loss": 2.8241, + "mean_token_accuracy": 0.42150798439979553, + "num_tokens": 6818771052.0, + "step": 13338 + }, + { + "epoch": 3.607084910762574, + "grad_norm": 2.9375, + "learning_rate": 0.005422816983153107, + "loss": 3.0317, + "mean_token_accuracy": 0.41112130880355835, + "num_tokens": 6819295234.0, + "step": 13339 + }, + { + "epoch": 3.607355327203894, + "grad_norm": 3.390625, + "learning_rate": 0.005421579762678776, + "loss": 2.9109, + "mean_token_accuracy": 0.42795929312705994, + "num_tokens": 6819819465.0, + "step": 13340 + }, + { + "epoch": 3.6076257436452135, + "grad_norm": 2.796875, + "learning_rate": 0.005420342713366786, + "loss": 2.9273, + "mean_token_accuracy": 0.4354252517223358, + "num_tokens": 6820283308.0, + "step": 13341 + }, + { + "epoch": 3.607896160086533, + "grad_norm": 2.953125, + "learning_rate": 0.00541910583525509, + "loss": 2.9523, + "mean_token_accuracy": 0.4182068705558777, + "num_tokens": 6820807482.0, + "step": 13342 + }, + { + "epoch": 3.6081665765278528, + "grad_norm": 2.75, + "learning_rate": 0.0054178691283816366, + "loss": 2.9756, + "mean_token_accuracy": 0.4175497591495514, + "num_tokens": 6821331657.0, + "step": 13343 + }, + { + "epoch": 3.6084369929691724, + "grad_norm": 3.03125, + "learning_rate": 0.005416632592784374, + "loss": 2.6035, + "mean_token_accuracy": 0.4073694348335266, + "num_tokens": 6821855847.0, + "step": 13344 + }, + { + "epoch": 3.608707409410492, + "grad_norm": 3.21875, + "learning_rate": 0.005415396228501247, + "loss": 2.835, + "mean_token_accuracy": 0.4284263849258423, + "num_tokens": 6822379957.0, + "step": 13345 + }, + { + "epoch": 3.6089778258518117, + "grad_norm": 2.84375, + "learning_rate": 0.005414160035570183, + "loss": 2.9163, + "mean_token_accuracy": 0.42072802782058716, + "num_tokens": 6822904216.0, + "step": 13346 + }, + { + "epoch": 3.6092482422931313, + "grad_norm": 3.125, + "learning_rate": 0.00541292401402912, + "loss": 2.7598, + "mean_token_accuracy": 0.4341669976711273, + "num_tokens": 6823394743.0, + "step": 13347 + }, + { + "epoch": 3.609518658734451, + "grad_norm": 2.703125, + "learning_rate": 0.005411688163915978, + "loss": 2.8482, + "mean_token_accuracy": 0.42467886209487915, + "num_tokens": 6823919008.0, + "step": 13348 + }, + { + "epoch": 3.6097890751757706, + "grad_norm": 3.328125, + "learning_rate": 0.0054104524852686756, + "loss": 3.0331, + "mean_token_accuracy": 0.4172566533088684, + "num_tokens": 6824379614.0, + "step": 13349 + }, + { + "epoch": 3.6100594916170903, + "grad_norm": 2.34375, + "learning_rate": 0.005409216978125129, + "loss": 2.8643, + "mean_token_accuracy": 0.4362061619758606, + "num_tokens": 6824856032.0, + "step": 13350 + }, + { + "epoch": 3.61032990805841, + "grad_norm": 8.1875, + "learning_rate": 0.005407981642523246, + "loss": 8.8899, + "mean_token_accuracy": 0.03001384064555168, + "num_tokens": 6825380291.0, + "step": 13351 + }, + { + "epoch": 3.6106003244997296, + "grad_norm": 8.0, + "learning_rate": 0.005406746478500935, + "loss": 3.0477, + "mean_token_accuracy": 0.41175946593284607, + "num_tokens": 6825806177.0, + "step": 13352 + }, + { + "epoch": 3.610870740941049, + "grad_norm": 2.21875, + "learning_rate": 0.005405511486096089, + "loss": 2.9451, + "mean_token_accuracy": 0.42567041516304016, + "num_tokens": 6826307356.0, + "step": 13353 + }, + { + "epoch": 3.611141157382369, + "grad_norm": 2.390625, + "learning_rate": 0.005404276665346599, + "loss": 2.7226, + "mean_token_accuracy": 0.4431656301021576, + "num_tokens": 6826831493.0, + "step": 13354 + }, + { + "epoch": 3.6114115738236885, + "grad_norm": 3.484375, + "learning_rate": 0.0054030420162903625, + "loss": 3.1328, + "mean_token_accuracy": 0.410065621137619, + "num_tokens": 6827355734.0, + "step": 13355 + }, + { + "epoch": 3.611681990265008, + "grad_norm": 3.296875, + "learning_rate": 0.005401807538965255, + "loss": 2.9209, + "mean_token_accuracy": 0.3989824056625366, + "num_tokens": 6827837495.0, + "step": 13356 + }, + { + "epoch": 3.611952406706328, + "grad_norm": 2.40625, + "learning_rate": 0.005400573233409152, + "loss": 2.9016, + "mean_token_accuracy": 0.4327751100063324, + "num_tokens": 6828361671.0, + "step": 13357 + }, + { + "epoch": 3.6122228231476474, + "grad_norm": 2.96875, + "learning_rate": 0.00539933909965993, + "loss": 2.8775, + "mean_token_accuracy": 0.424007773399353, + "num_tokens": 6828885923.0, + "step": 13358 + }, + { + "epoch": 3.612493239588967, + "grad_norm": 2.84375, + "learning_rate": 0.005398105137755458, + "loss": 2.8746, + "mean_token_accuracy": 0.42414411902427673, + "num_tokens": 6829410152.0, + "step": 13359 + }, + { + "epoch": 3.6127636560302867, + "grad_norm": 2.6875, + "learning_rate": 0.005396871347733592, + "loss": 2.8624, + "mean_token_accuracy": 0.42155200242996216, + "num_tokens": 6829934401.0, + "step": 13360 + }, + { + "epoch": 3.6130340724716064, + "grad_norm": 2.921875, + "learning_rate": 0.005395637729632192, + "loss": 2.9686, + "mean_token_accuracy": 0.4314649701118469, + "num_tokens": 6830458606.0, + "step": 13361 + }, + { + "epoch": 3.613304488912926, + "grad_norm": 2.78125, + "learning_rate": 0.005394404283489109, + "loss": 2.7915, + "mean_token_accuracy": 0.4244382381439209, + "num_tokens": 6830982634.0, + "step": 13362 + }, + { + "epoch": 3.6135749053542456, + "grad_norm": 2.65625, + "learning_rate": 0.005393171009342184, + "loss": 2.9561, + "mean_token_accuracy": 0.41727742552757263, + "num_tokens": 6831478121.0, + "step": 13363 + }, + { + "epoch": 3.6138453217955653, + "grad_norm": 2.84375, + "learning_rate": 0.005391937907229265, + "loss": 2.7935, + "mean_token_accuracy": 0.42823994159698486, + "num_tokens": 6832002281.0, + "step": 13364 + }, + { + "epoch": 3.614115738236885, + "grad_norm": 3.21875, + "learning_rate": 0.005390704977188181, + "loss": 2.8995, + "mean_token_accuracy": 0.4232364296913147, + "num_tokens": 6832526460.0, + "step": 13365 + }, + { + "epoch": 3.6143861546782046, + "grad_norm": 3.375, + "learning_rate": 0.005389472219256764, + "loss": 2.9591, + "mean_token_accuracy": 0.42900004982948303, + "num_tokens": 6833050730.0, + "step": 13366 + }, + { + "epoch": 3.614656571119524, + "grad_norm": 3.453125, + "learning_rate": 0.00538823963347284, + "loss": 2.6553, + "mean_token_accuracy": 0.4369761645793915, + "num_tokens": 6833521113.0, + "step": 13367 + }, + { + "epoch": 3.614926987560844, + "grad_norm": 3.71875, + "learning_rate": 0.005387007219874229, + "loss": 2.7848, + "mean_token_accuracy": 0.44373324513435364, + "num_tokens": 6834007465.0, + "step": 13368 + }, + { + "epoch": 3.6151974040021635, + "grad_norm": 2.765625, + "learning_rate": 0.005385774978498739, + "loss": 2.9206, + "mean_token_accuracy": 0.4285547137260437, + "num_tokens": 6834531634.0, + "step": 13369 + }, + { + "epoch": 3.6154678204434827, + "grad_norm": 3.578125, + "learning_rate": 0.005384542909384184, + "loss": 3.0619, + "mean_token_accuracy": 0.4296500086784363, + "num_tokens": 6835024774.0, + "step": 13370 + }, + { + "epoch": 3.615738236884803, + "grad_norm": 36.0, + "learning_rate": 0.005383311012568366, + "loss": 9.5624, + "mean_token_accuracy": 0.04040181636810303, + "num_tokens": 6835485013.0, + "step": 13371 + }, + { + "epoch": 3.616008653326122, + "grad_norm": 7.5, + "learning_rate": 0.005382079288089087, + "loss": 3.107, + "mean_token_accuracy": 0.41083356738090515, + "num_tokens": 6835964680.0, + "step": 13372 + }, + { + "epoch": 3.616279069767442, + "grad_norm": 2.4375, + "learning_rate": 0.0053808477359841315, + "loss": 2.8126, + "mean_token_accuracy": 0.42536401748657227, + "num_tokens": 6836488860.0, + "step": 13373 + }, + { + "epoch": 3.6165494862087613, + "grad_norm": 3.078125, + "learning_rate": 0.0053796163562912945, + "loss": 3.0324, + "mean_token_accuracy": 0.43198394775390625, + "num_tokens": 6837013111.0, + "step": 13374 + }, + { + "epoch": 3.6168199026500814, + "grad_norm": 2.53125, + "learning_rate": 0.005378385149048358, + "loss": 2.9511, + "mean_token_accuracy": 0.4194824695587158, + "num_tokens": 6837537280.0, + "step": 13375 + }, + { + "epoch": 3.6170903190914006, + "grad_norm": 2.828125, + "learning_rate": 0.005377154114293097, + "loss": 2.7026, + "mean_token_accuracy": 0.4540567696094513, + "num_tokens": 6838022997.0, + "step": 13376 + }, + { + "epoch": 3.6173607355327206, + "grad_norm": 3.0625, + "learning_rate": 0.005375923252063282, + "loss": 2.9205, + "mean_token_accuracy": 0.42931532859802246, + "num_tokens": 6838500373.0, + "step": 13377 + }, + { + "epoch": 3.61763115197404, + "grad_norm": 3.078125, + "learning_rate": 0.005374692562396683, + "loss": 3.0008, + "mean_token_accuracy": 0.4062093496322632, + "num_tokens": 6839024589.0, + "step": 13378 + }, + { + "epoch": 3.61790156841536, + "grad_norm": 2.90625, + "learning_rate": 0.005373462045331059, + "loss": 3.009, + "mean_token_accuracy": 0.42570605874061584, + "num_tokens": 6839548835.0, + "step": 13379 + }, + { + "epoch": 3.618171984856679, + "grad_norm": 3.984375, + "learning_rate": 0.0053722317009041645, + "loss": 2.8951, + "mean_token_accuracy": 0.449140727519989, + "num_tokens": 6839968158.0, + "step": 13380 + }, + { + "epoch": 3.6184424012979988, + "grad_norm": 2.921875, + "learning_rate": 0.0053710015291537514, + "loss": 2.8366, + "mean_token_accuracy": 0.43566519021987915, + "num_tokens": 6840492336.0, + "step": 13381 + }, + { + "epoch": 3.6187128177393184, + "grad_norm": 3.484375, + "learning_rate": 0.005369771530117568, + "loss": 2.9707, + "mean_token_accuracy": 0.4312257170677185, + "num_tokens": 6840950115.0, + "step": 13382 + }, + { + "epoch": 3.618983234180638, + "grad_norm": 3.40625, + "learning_rate": 0.005368541703833348, + "loss": 2.9112, + "mean_token_accuracy": 0.41779541969299316, + "num_tokens": 6841474163.0, + "step": 13383 + }, + { + "epoch": 3.6192536506219577, + "grad_norm": 2.921875, + "learning_rate": 0.005367312050338832, + "loss": 2.8212, + "mean_token_accuracy": 0.4340151846408844, + "num_tokens": 6841998434.0, + "step": 13384 + }, + { + "epoch": 3.6195240670632773, + "grad_norm": 3.3125, + "learning_rate": 0.005366082569671743, + "loss": 3.0293, + "mean_token_accuracy": 0.4092675745487213, + "num_tokens": 6842522705.0, + "step": 13385 + }, + { + "epoch": 3.619794483504597, + "grad_norm": 49.75, + "learning_rate": 0.005364853261869814, + "loss": 3.0031, + "mean_token_accuracy": 0.3977785110473633, + "num_tokens": 6843046968.0, + "step": 13386 + }, + { + "epoch": 3.6200648999459166, + "grad_norm": 4.6875, + "learning_rate": 0.005363624126970754, + "loss": 2.8471, + "mean_token_accuracy": 0.4318809509277344, + "num_tokens": 6843562859.0, + "step": 13387 + }, + { + "epoch": 3.6203353163872363, + "grad_norm": 2.75, + "learning_rate": 0.005362395165012286, + "loss": 2.8585, + "mean_token_accuracy": 0.42386072874069214, + "num_tokens": 6844087050.0, + "step": 13388 + }, + { + "epoch": 3.620605732828556, + "grad_norm": 2.9375, + "learning_rate": 0.00536116637603211, + "loss": 2.9384, + "mean_token_accuracy": 0.47299695014953613, + "num_tokens": 6844546022.0, + "step": 13389 + }, + { + "epoch": 3.6208761492698756, + "grad_norm": 2.984375, + "learning_rate": 0.005359937760067934, + "loss": 2.873, + "mean_token_accuracy": 0.4362899959087372, + "num_tokens": 6845041512.0, + "step": 13390 + }, + { + "epoch": 3.621146565711195, + "grad_norm": 304.0, + "learning_rate": 0.005358709317157455, + "loss": 16.6749, + "mean_token_accuracy": 0.0035266459453850985, + "num_tokens": 6845565679.0, + "step": 13391 + }, + { + "epoch": 3.621416982152515, + "grad_norm": 7.34375, + "learning_rate": 0.005357481047338361, + "loss": 2.9045, + "mean_token_accuracy": 0.4608651399612427, + "num_tokens": 6846037413.0, + "step": 13392 + }, + { + "epoch": 3.6216873985938345, + "grad_norm": 3.734375, + "learning_rate": 0.0053562529506483445, + "loss": 2.8454, + "mean_token_accuracy": 0.4127805233001709, + "num_tokens": 6846561668.0, + "step": 13393 + }, + { + "epoch": 3.621957815035154, + "grad_norm": 2.734375, + "learning_rate": 0.005355025027125085, + "loss": 2.8539, + "mean_token_accuracy": 0.42298126220703125, + "num_tokens": 6847040108.0, + "step": 13394 + }, + { + "epoch": 3.6222282314764738, + "grad_norm": 2.09375, + "learning_rate": 0.005353797276806257, + "loss": 2.8913, + "mean_token_accuracy": 0.42521223425865173, + "num_tokens": 6847564342.0, + "step": 13395 + }, + { + "epoch": 3.6224986479177934, + "grad_norm": 3.328125, + "learning_rate": 0.005352569699729534, + "loss": 2.54, + "mean_token_accuracy": 0.43874597549438477, + "num_tokens": 6848088511.0, + "step": 13396 + }, + { + "epoch": 3.622769064359113, + "grad_norm": 2.828125, + "learning_rate": 0.005351342295932577, + "loss": 2.8746, + "mean_token_accuracy": 0.42566272616386414, + "num_tokens": 6848612699.0, + "step": 13397 + }, + { + "epoch": 3.6230394808004327, + "grad_norm": 2.578125, + "learning_rate": 0.005350115065453055, + "loss": 2.8775, + "mean_token_accuracy": 0.4386257231235504, + "num_tokens": 6849136966.0, + "step": 13398 + }, + { + "epoch": 3.6233098972417523, + "grad_norm": 3.078125, + "learning_rate": 0.005348888008328618, + "loss": 2.714, + "mean_token_accuracy": 0.44692137837409973, + "num_tokens": 6849661212.0, + "step": 13399 + }, + { + "epoch": 3.623580313683072, + "grad_norm": 2.515625, + "learning_rate": 0.005347661124596912, + "loss": 2.7353, + "mean_token_accuracy": 0.45540717244148254, + "num_tokens": 6850185395.0, + "step": 13400 + }, + { + "epoch": 3.6238507301243916, + "grad_norm": 2.859375, + "learning_rate": 0.0053464344142955846, + "loss": 3.1042, + "mean_token_accuracy": 0.41731297969818115, + "num_tokens": 6850709602.0, + "step": 13401 + }, + { + "epoch": 3.6241211465657113, + "grad_norm": 2.65625, + "learning_rate": 0.005345207877462279, + "loss": 2.92, + "mean_token_accuracy": 0.43806007504463196, + "num_tokens": 6851226441.0, + "step": 13402 + }, + { + "epoch": 3.624391563007031, + "grad_norm": 2.515625, + "learning_rate": 0.005343981514134621, + "loss": 2.8053, + "mean_token_accuracy": 0.4223436713218689, + "num_tokens": 6851750714.0, + "step": 13403 + }, + { + "epoch": 3.6246619794483506, + "grad_norm": 2.890625, + "learning_rate": 0.005342755324350248, + "loss": 2.9391, + "mean_token_accuracy": 0.43390488624572754, + "num_tokens": 6852274919.0, + "step": 13404 + }, + { + "epoch": 3.62493239588967, + "grad_norm": 2.890625, + "learning_rate": 0.0053415293081467774, + "loss": 2.8679, + "mean_token_accuracy": 0.42762917280197144, + "num_tokens": 6852799200.0, + "step": 13405 + }, + { + "epoch": 3.62520281233099, + "grad_norm": 3.53125, + "learning_rate": 0.005340303465561826, + "loss": 2.8698, + "mean_token_accuracy": 0.4087774157524109, + "num_tokens": 6853323480.0, + "step": 13406 + }, + { + "epoch": 3.6254732287723095, + "grad_norm": 2.640625, + "learning_rate": 0.0053390777966330115, + "loss": 2.9697, + "mean_token_accuracy": 0.418509840965271, + "num_tokens": 6853847757.0, + "step": 13407 + }, + { + "epoch": 3.625743645213629, + "grad_norm": 3.28125, + "learning_rate": 0.005337852301397938, + "loss": 2.7766, + "mean_token_accuracy": 0.42949992418289185, + "num_tokens": 6854371983.0, + "step": 13408 + }, + { + "epoch": 3.6260140616549488, + "grad_norm": 2.515625, + "learning_rate": 0.005336626979894204, + "loss": 2.9476, + "mean_token_accuracy": 0.4141744375228882, + "num_tokens": 6854896205.0, + "step": 13409 + }, + { + "epoch": 3.6262844780962684, + "grad_norm": 4.4375, + "learning_rate": 0.005335401832159413, + "loss": 3.0454, + "mean_token_accuracy": 0.40031588077545166, + "num_tokens": 6855369191.0, + "step": 13410 + }, + { + "epoch": 3.6265548945375876, + "grad_norm": 76.5, + "learning_rate": 0.00533417685823115, + "loss": 11.8869, + "mean_token_accuracy": 1.4865257981000468e-05, + "num_tokens": 6855893415.0, + "step": 13411 + }, + { + "epoch": 3.6268253109789077, + "grad_norm": 4.78125, + "learning_rate": 0.005332952058147001, + "loss": 3.0997, + "mean_token_accuracy": 0.43063029646873474, + "num_tokens": 6856360083.0, + "step": 13412 + }, + { + "epoch": 3.627095727420227, + "grad_norm": 23.75, + "learning_rate": 0.005331727431944553, + "loss": 2.8909, + "mean_token_accuracy": 0.42638665437698364, + "num_tokens": 6856862487.0, + "step": 13413 + }, + { + "epoch": 3.627366143861547, + "grad_norm": 4.28125, + "learning_rate": 0.005330502979661373, + "loss": 2.9844, + "mean_token_accuracy": 0.42690181732177734, + "num_tokens": 6857386738.0, + "step": 13414 + }, + { + "epoch": 3.627636560302866, + "grad_norm": 3.140625, + "learning_rate": 0.005329278701335037, + "loss": 3.0879, + "mean_token_accuracy": 0.39196377992630005, + "num_tokens": 6857910975.0, + "step": 13415 + }, + { + "epoch": 3.6279069767441863, + "grad_norm": 4.125, + "learning_rate": 0.005328054597003104, + "loss": 2.6814, + "mean_token_accuracy": 0.4238649606704712, + "num_tokens": 6858435224.0, + "step": 13416 + }, + { + "epoch": 3.6281773931855055, + "grad_norm": 2.90625, + "learning_rate": 0.005326830666703137, + "loss": 2.764, + "mean_token_accuracy": 0.4671100974082947, + "num_tokens": 6858959418.0, + "step": 13417 + }, + { + "epoch": 3.6284478096268256, + "grad_norm": 2.203125, + "learning_rate": 0.005325606910472689, + "loss": 2.86, + "mean_token_accuracy": 0.4351673126220703, + "num_tokens": 6859483615.0, + "step": 13418 + }, + { + "epoch": 3.6287182260681448, + "grad_norm": 2.625, + "learning_rate": 0.005324383328349311, + "loss": 2.7272, + "mean_token_accuracy": 0.441415011882782, + "num_tokens": 6859982932.0, + "step": 13419 + }, + { + "epoch": 3.628988642509465, + "grad_norm": 3.328125, + "learning_rate": 0.005323159920370539, + "loss": 2.8691, + "mean_token_accuracy": 0.4311699867248535, + "num_tokens": 6860446781.0, + "step": 13420 + }, + { + "epoch": 3.629259058950784, + "grad_norm": 3.21875, + "learning_rate": 0.005321936686573919, + "loss": 2.6533, + "mean_token_accuracy": 0.4306674301624298, + "num_tokens": 6860970978.0, + "step": 13421 + }, + { + "epoch": 3.6295294753921037, + "grad_norm": 2.703125, + "learning_rate": 0.005320713626996979, + "loss": 2.9405, + "mean_token_accuracy": 0.42384251952171326, + "num_tokens": 6861495261.0, + "step": 13422 + }, + { + "epoch": 3.6297998918334233, + "grad_norm": 3.109375, + "learning_rate": 0.005319490741677244, + "loss": 2.9432, + "mean_token_accuracy": 0.43717724084854126, + "num_tokens": 6862007206.0, + "step": 13423 + }, + { + "epoch": 3.630070308274743, + "grad_norm": 3.03125, + "learning_rate": 0.005318268030652241, + "loss": 2.8971, + "mean_token_accuracy": 0.42024120688438416, + "num_tokens": 6862479741.0, + "step": 13424 + }, + { + "epoch": 3.6303407247160626, + "grad_norm": 2.53125, + "learning_rate": 0.005317045493959485, + "loss": 2.9238, + "mean_token_accuracy": 0.418881893157959, + "num_tokens": 6863003727.0, + "step": 13425 + }, + { + "epoch": 3.6306111411573823, + "grad_norm": 3.03125, + "learning_rate": 0.005315823131636481, + "loss": 2.7882, + "mean_token_accuracy": 0.4308003783226013, + "num_tokens": 6863527887.0, + "step": 13426 + }, + { + "epoch": 3.630881557598702, + "grad_norm": 3.4375, + "learning_rate": 0.005314600943720744, + "loss": 2.9783, + "mean_token_accuracy": 0.4036928713321686, + "num_tokens": 6864052001.0, + "step": 13427 + }, + { + "epoch": 3.6311519740400215, + "grad_norm": 3.75, + "learning_rate": 0.005313378930249765, + "loss": 3.0642, + "mean_token_accuracy": 0.42552614212036133, + "num_tokens": 6864576244.0, + "step": 13428 + }, + { + "epoch": 3.631422390481341, + "grad_norm": 3.734375, + "learning_rate": 0.005312157091261047, + "loss": 2.8937, + "mean_token_accuracy": 0.4128844738006592, + "num_tokens": 6865100443.0, + "step": 13429 + }, + { + "epoch": 3.631692806922661, + "grad_norm": 3.0625, + "learning_rate": 0.005310935426792074, + "loss": 2.8264, + "mean_token_accuracy": 0.4424676299095154, + "num_tokens": 6865566971.0, + "step": 13430 + }, + { + "epoch": 3.6319632233639805, + "grad_norm": 97.0, + "learning_rate": 0.005309713936880336, + "loss": 13.798, + "mean_token_accuracy": 0.04004959762096405, + "num_tokens": 6866091189.0, + "step": 13431 + }, + { + "epoch": 3.6322336398053, + "grad_norm": 5.84375, + "learning_rate": 0.005308492621563305, + "loss": 3.2215, + "mean_token_accuracy": 0.3724827766418457, + "num_tokens": 6866615351.0, + "step": 13432 + }, + { + "epoch": 3.6325040562466198, + "grad_norm": 2.046875, + "learning_rate": 0.005307271480878459, + "loss": 2.9426, + "mean_token_accuracy": 0.42879489064216614, + "num_tokens": 6867118816.0, + "step": 13433 + }, + { + "epoch": 3.6327744726879394, + "grad_norm": 2.421875, + "learning_rate": 0.005306050514863268, + "loss": 2.9613, + "mean_token_accuracy": 0.4357737898826599, + "num_tokens": 6867643034.0, + "step": 13434 + }, + { + "epoch": 3.633044889129259, + "grad_norm": 2.9375, + "learning_rate": 0.005304829723555189, + "loss": 2.9688, + "mean_token_accuracy": 0.41802501678466797, + "num_tokens": 6868159360.0, + "step": 13435 + }, + { + "epoch": 3.6333153055705787, + "grad_norm": 2.6875, + "learning_rate": 0.0053036091069916845, + "loss": 2.8134, + "mean_token_accuracy": 0.4294276535511017, + "num_tokens": 6868683583.0, + "step": 13436 + }, + { + "epoch": 3.6335857220118983, + "grad_norm": 3.5, + "learning_rate": 0.005302388665210204, + "loss": 2.4481, + "mean_token_accuracy": 0.48675432801246643, + "num_tokens": 6869207852.0, + "step": 13437 + }, + { + "epoch": 3.633856138453218, + "grad_norm": 2.515625, + "learning_rate": 0.0053011683982481924, + "loss": 3.0165, + "mean_token_accuracy": 0.3932513892650604, + "num_tokens": 6869732031.0, + "step": 13438 + }, + { + "epoch": 3.6341265548945376, + "grad_norm": 4.53125, + "learning_rate": 0.005299948306143097, + "loss": 2.9891, + "mean_token_accuracy": 0.41034573316574097, + "num_tokens": 6870249653.0, + "step": 13439 + }, + { + "epoch": 3.6343969713358573, + "grad_norm": 2.921875, + "learning_rate": 0.00529872838893235, + "loss": 2.7931, + "mean_token_accuracy": 0.4391212463378906, + "num_tokens": 6870773809.0, + "step": 13440 + }, + { + "epoch": 3.634667387777177, + "grad_norm": 3.15625, + "learning_rate": 0.005297508646653379, + "loss": 2.8768, + "mean_token_accuracy": 0.4301893711090088, + "num_tokens": 6871246754.0, + "step": 13441 + }, + { + "epoch": 3.6349378042184965, + "grad_norm": 2.75, + "learning_rate": 0.005296289079343617, + "loss": 2.8025, + "mean_token_accuracy": 0.44586068391799927, + "num_tokens": 6871770972.0, + "step": 13442 + }, + { + "epoch": 3.635208220659816, + "grad_norm": 2.921875, + "learning_rate": 0.005295069687040474, + "loss": 2.7331, + "mean_token_accuracy": 0.4370715618133545, + "num_tokens": 6872295125.0, + "step": 13443 + }, + { + "epoch": 3.635478637101136, + "grad_norm": 2.53125, + "learning_rate": 0.005293850469781372, + "loss": 2.9026, + "mean_token_accuracy": 0.4349832236766815, + "num_tokens": 6872819230.0, + "step": 13444 + }, + { + "epoch": 3.6357490535424555, + "grad_norm": 3.015625, + "learning_rate": 0.0052926314276037206, + "loss": 2.7992, + "mean_token_accuracy": 0.43438032269477844, + "num_tokens": 6873343485.0, + "step": 13445 + }, + { + "epoch": 3.636019469983775, + "grad_norm": 2.5625, + "learning_rate": 0.005291412560544918, + "loss": 2.8554, + "mean_token_accuracy": 0.4311319589614868, + "num_tokens": 6873867687.0, + "step": 13446 + }, + { + "epoch": 3.6362898864250948, + "grad_norm": 3.421875, + "learning_rate": 0.00529019386864237, + "loss": 2.8449, + "mean_token_accuracy": 0.42560014128685, + "num_tokens": 6874391944.0, + "step": 13447 + }, + { + "epoch": 3.6365603028664144, + "grad_norm": 3.09375, + "learning_rate": 0.005288975351933465, + "loss": 2.983, + "mean_token_accuracy": 0.4141707420349121, + "num_tokens": 6874916118.0, + "step": 13448 + }, + { + "epoch": 3.636830719307734, + "grad_norm": 3.125, + "learning_rate": 0.005287757010455588, + "loss": 2.9216, + "mean_token_accuracy": 0.42607247829437256, + "num_tokens": 6875424173.0, + "step": 13449 + }, + { + "epoch": 3.6371011357490537, + "grad_norm": 2.8125, + "learning_rate": 0.005286538844246127, + "loss": 2.9304, + "mean_token_accuracy": 0.43484413623809814, + "num_tokens": 6875948328.0, + "step": 13450 + }, + { + "epoch": 3.6373715521903733, + "grad_norm": 334.0, + "learning_rate": 0.005285320853342458, + "loss": 25.995, + "mean_token_accuracy": 0.0, + "num_tokens": 6876441322.0, + "step": 13451 + }, + { + "epoch": 3.6376419686316925, + "grad_norm": 6.78125, + "learning_rate": 0.005284103037781947, + "loss": 2.9408, + "mean_token_accuracy": 0.40751659870147705, + "num_tokens": 6876965520.0, + "step": 13452 + }, + { + "epoch": 3.6379123850730126, + "grad_norm": 2.046875, + "learning_rate": 0.005282885397601967, + "loss": 2.7874, + "mean_token_accuracy": 0.44835343956947327, + "num_tokens": 6877461221.0, + "step": 13453 + }, + { + "epoch": 3.638182801514332, + "grad_norm": 2.984375, + "learning_rate": 0.005281667932839878, + "loss": 3.1121, + "mean_token_accuracy": 0.43125635385513306, + "num_tokens": 6877930872.0, + "step": 13454 + }, + { + "epoch": 3.638453217955652, + "grad_norm": 2.4375, + "learning_rate": 0.005280450643533028, + "loss": 2.9646, + "mean_token_accuracy": 0.4339160919189453, + "num_tokens": 6878411025.0, + "step": 13455 + }, + { + "epoch": 3.638723634396971, + "grad_norm": 3.46875, + "learning_rate": 0.005279233529718775, + "loss": 2.8094, + "mean_token_accuracy": 0.47225868701934814, + "num_tokens": 6878885911.0, + "step": 13456 + }, + { + "epoch": 3.638994050838291, + "grad_norm": 2.75, + "learning_rate": 0.00527801659143446, + "loss": 2.7452, + "mean_token_accuracy": 0.4447021484375, + "num_tokens": 6879410167.0, + "step": 13457 + }, + { + "epoch": 3.6392644672796104, + "grad_norm": 3.3125, + "learning_rate": 0.0052767998287174215, + "loss": 2.8149, + "mean_token_accuracy": 0.4177935719490051, + "num_tokens": 6879934235.0, + "step": 13458 + }, + { + "epoch": 3.6395348837209305, + "grad_norm": 3.1875, + "learning_rate": 0.0052755832416049985, + "loss": 2.682, + "mean_token_accuracy": 0.46625250577926636, + "num_tokens": 6880442617.0, + "step": 13459 + }, + { + "epoch": 3.6398053001622497, + "grad_norm": 3.515625, + "learning_rate": 0.005274366830134516, + "loss": 3.0781, + "mean_token_accuracy": 0.4122794270515442, + "num_tokens": 6880966745.0, + "step": 13460 + }, + { + "epoch": 3.6400757166035698, + "grad_norm": 3.3125, + "learning_rate": 0.005273150594343294, + "loss": 2.9911, + "mean_token_accuracy": 0.42308372259140015, + "num_tokens": 6881447441.0, + "step": 13461 + }, + { + "epoch": 3.640346133044889, + "grad_norm": 2.921875, + "learning_rate": 0.005271934534268659, + "loss": 2.8998, + "mean_token_accuracy": 0.41115444898605347, + "num_tokens": 6881971620.0, + "step": 13462 + }, + { + "epoch": 3.6406165494862086, + "grad_norm": 2.90625, + "learning_rate": 0.005270718649947915, + "loss": 2.996, + "mean_token_accuracy": 0.41655611991882324, + "num_tokens": 6882475819.0, + "step": 13463 + }, + { + "epoch": 3.6408869659275283, + "grad_norm": 3.171875, + "learning_rate": 0.005269502941418369, + "loss": 2.7773, + "mean_token_accuracy": 0.41597995162010193, + "num_tokens": 6883000049.0, + "step": 13464 + }, + { + "epoch": 3.641157382368848, + "grad_norm": 2.546875, + "learning_rate": 0.005268287408717328, + "loss": 2.9694, + "mean_token_accuracy": 0.4283050000667572, + "num_tokens": 6883524296.0, + "step": 13465 + }, + { + "epoch": 3.6414277988101675, + "grad_norm": 2.703125, + "learning_rate": 0.005267072051882082, + "loss": 2.816, + "mean_token_accuracy": 0.42847740650177, + "num_tokens": 6884048442.0, + "step": 13466 + }, + { + "epoch": 3.641698215251487, + "grad_norm": 3.234375, + "learning_rate": 0.005265856870949928, + "loss": 2.9082, + "mean_token_accuracy": 0.4410170316696167, + "num_tokens": 6884519618.0, + "step": 13467 + }, + { + "epoch": 3.641968631692807, + "grad_norm": 3.875, + "learning_rate": 0.0052646418659581485, + "loss": 2.8866, + "mean_token_accuracy": 0.43176817893981934, + "num_tokens": 6885043756.0, + "step": 13468 + }, + { + "epoch": 3.6422390481341265, + "grad_norm": 3.078125, + "learning_rate": 0.005263427036944019, + "loss": 2.7796, + "mean_token_accuracy": 0.43450596928596497, + "num_tokens": 6885568035.0, + "step": 13469 + }, + { + "epoch": 3.642509464575446, + "grad_norm": 3.09375, + "learning_rate": 0.0052622123839448235, + "loss": 2.878, + "mean_token_accuracy": 0.43377459049224854, + "num_tokens": 6886080179.0, + "step": 13470 + }, + { + "epoch": 3.6427798810167658, + "grad_norm": 214.0, + "learning_rate": 0.005260997906997822, + "loss": 16.3471, + "mean_token_accuracy": 0.0007996874628588557, + "num_tokens": 6886604446.0, + "step": 13471 + }, + { + "epoch": 3.6430502974580854, + "grad_norm": 7.59375, + "learning_rate": 0.005259783606140283, + "loss": 3.1654, + "mean_token_accuracy": 0.4086247682571411, + "num_tokens": 6887128424.0, + "step": 13472 + }, + { + "epoch": 3.643320713899405, + "grad_norm": 1.84375, + "learning_rate": 0.005258569481409463, + "loss": 2.7798, + "mean_token_accuracy": 0.44066321849823, + "num_tokens": 6887608065.0, + "step": 13473 + }, + { + "epoch": 3.6435911303407247, + "grad_norm": 2.546875, + "learning_rate": 0.005257355532842618, + "loss": 2.8217, + "mean_token_accuracy": 0.44809889793395996, + "num_tokens": 6888123326.0, + "step": 13474 + }, + { + "epoch": 3.6438615467820443, + "grad_norm": 2.328125, + "learning_rate": 0.00525614176047699, + "loss": 2.8705, + "mean_token_accuracy": 0.42977631092071533, + "num_tokens": 6888647544.0, + "step": 13475 + }, + { + "epoch": 3.644131963223364, + "grad_norm": 2.953125, + "learning_rate": 0.005254928164349828, + "loss": 2.8606, + "mean_token_accuracy": 0.4413265585899353, + "num_tokens": 6889171649.0, + "step": 13476 + }, + { + "epoch": 3.6444023796646836, + "grad_norm": 3.078125, + "learning_rate": 0.005253714744498364, + "loss": 2.8884, + "mean_token_accuracy": 0.4307129383087158, + "num_tokens": 6889643276.0, + "step": 13477 + }, + { + "epoch": 3.6446727961060033, + "grad_norm": 3.28125, + "learning_rate": 0.005252501500959829, + "loss": 2.9572, + "mean_token_accuracy": 0.4242607057094574, + "num_tokens": 6890167511.0, + "step": 13478 + }, + { + "epoch": 3.644943212547323, + "grad_norm": 3.671875, + "learning_rate": 0.005251288433771453, + "loss": 2.9192, + "mean_token_accuracy": 0.43278369307518005, + "num_tokens": 6890691779.0, + "step": 13479 + }, + { + "epoch": 3.6452136289886425, + "grad_norm": 3.0625, + "learning_rate": 0.0052500755429704515, + "loss": 2.8617, + "mean_token_accuracy": 0.42913150787353516, + "num_tokens": 6891183372.0, + "step": 13480 + }, + { + "epoch": 3.645484045429962, + "grad_norm": 3.25, + "learning_rate": 0.005248862828594041, + "loss": 2.9978, + "mean_token_accuracy": 0.42372259497642517, + "num_tokens": 6891686504.0, + "step": 13481 + }, + { + "epoch": 3.645754461871282, + "grad_norm": 3.234375, + "learning_rate": 0.005247650290679433, + "loss": 2.7532, + "mean_token_accuracy": 0.46001380681991577, + "num_tokens": 6892159721.0, + "step": 13482 + }, + { + "epoch": 3.6460248783126015, + "grad_norm": 2.859375, + "learning_rate": 0.005246437929263833, + "loss": 2.9042, + "mean_token_accuracy": 0.4275205135345459, + "num_tokens": 6892628889.0, + "step": 13483 + }, + { + "epoch": 3.646295294753921, + "grad_norm": 3.53125, + "learning_rate": 0.005245225744384434, + "loss": 2.8258, + "mean_token_accuracy": 0.4356214702129364, + "num_tokens": 6893153098.0, + "step": 13484 + }, + { + "epoch": 3.6465657111952408, + "grad_norm": 3.25, + "learning_rate": 0.005244013736078432, + "loss": 2.7389, + "mean_token_accuracy": 0.4500235915184021, + "num_tokens": 6893667739.0, + "step": 13485 + }, + { + "epoch": 3.6468361276365604, + "grad_norm": 3.71875, + "learning_rate": 0.005242801904383019, + "loss": 2.8464, + "mean_token_accuracy": 0.40760937333106995, + "num_tokens": 6894192017.0, + "step": 13486 + }, + { + "epoch": 3.64710654407788, + "grad_norm": 24.625, + "learning_rate": 0.005241590249335371, + "loss": 2.9404, + "mean_token_accuracy": 0.4471108317375183, + "num_tokens": 6894716218.0, + "step": 13487 + }, + { + "epoch": 3.6473769605191997, + "grad_norm": 4.375, + "learning_rate": 0.0052403787709726715, + "loss": 3.0923, + "mean_token_accuracy": 0.41386938095092773, + "num_tokens": 6895202898.0, + "step": 13488 + }, + { + "epoch": 3.6476473769605193, + "grad_norm": 2.4375, + "learning_rate": 0.0052391674693320865, + "loss": 2.951, + "mean_token_accuracy": 0.4165363311767578, + "num_tokens": 6895703475.0, + "step": 13489 + }, + { + "epoch": 3.647917793401839, + "grad_norm": 2.484375, + "learning_rate": 0.005237956344450789, + "loss": 2.8897, + "mean_token_accuracy": 0.4105871915817261, + "num_tokens": 6896227701.0, + "step": 13490 + }, + { + "epoch": 3.6481882098431586, + "grad_norm": 107.5, + "learning_rate": 0.005236745396365935, + "loss": 12.5441, + "mean_token_accuracy": 0.0025612383615225554, + "num_tokens": 6896751816.0, + "step": 13491 + }, + { + "epoch": 3.6484586262844783, + "grad_norm": 7.03125, + "learning_rate": 0.005235534625114677, + "loss": 3.0154, + "mean_token_accuracy": 0.4139522910118103, + "num_tokens": 6897276100.0, + "step": 13492 + }, + { + "epoch": 3.6487290427257975, + "grad_norm": 2.171875, + "learning_rate": 0.005234324030734174, + "loss": 2.9838, + "mean_token_accuracy": 0.4251708984375, + "num_tokens": 6897800350.0, + "step": 13493 + }, + { + "epoch": 3.6489994591671175, + "grad_norm": 3.09375, + "learning_rate": 0.005233113613261564, + "loss": 2.6497, + "mean_token_accuracy": 0.45610374212265015, + "num_tokens": 6898324486.0, + "step": 13494 + }, + { + "epoch": 3.6492698756084367, + "grad_norm": 3.375, + "learning_rate": 0.005231903372733986, + "loss": 2.842, + "mean_token_accuracy": 0.4346199333667755, + "num_tokens": 6898847688.0, + "step": 13495 + }, + { + "epoch": 3.649540292049757, + "grad_norm": 2.921875, + "learning_rate": 0.0052306933091885765, + "loss": 2.9888, + "mean_token_accuracy": 0.4172689616680145, + "num_tokens": 6899371972.0, + "step": 13496 + }, + { + "epoch": 3.649810708491076, + "grad_norm": 2.796875, + "learning_rate": 0.005229483422662464, + "loss": 2.6817, + "mean_token_accuracy": 0.4384501874446869, + "num_tokens": 6899896151.0, + "step": 13497 + }, + { + "epoch": 3.650081124932396, + "grad_norm": 2.34375, + "learning_rate": 0.005228273713192768, + "loss": 2.9499, + "mean_token_accuracy": 0.418689101934433, + "num_tokens": 6900414174.0, + "step": 13498 + }, + { + "epoch": 3.6503515413737153, + "grad_norm": 3.421875, + "learning_rate": 0.00522706418081661, + "loss": 2.8677, + "mean_token_accuracy": 0.44696635007858276, + "num_tokens": 6900938370.0, + "step": 13499 + }, + { + "epoch": 3.6506219578150354, + "grad_norm": 3.078125, + "learning_rate": 0.005225854825571097, + "loss": 2.8437, + "mean_token_accuracy": 0.40841591358184814, + "num_tokens": 6901462597.0, + "step": 13500 + }, + { + "epoch": 3.6508923742563546, + "grad_norm": 3.515625, + "learning_rate": 0.005224645647493338, + "loss": 2.873, + "mean_token_accuracy": 0.4291032552719116, + "num_tokens": 6901943482.0, + "step": 13501 + }, + { + "epoch": 3.6511627906976747, + "grad_norm": 3.453125, + "learning_rate": 0.0052234366466204375, + "loss": 2.8663, + "mean_token_accuracy": 0.4374496042728424, + "num_tokens": 6902423343.0, + "step": 13502 + }, + { + "epoch": 3.651433207138994, + "grad_norm": 3.40625, + "learning_rate": 0.005222227822989489, + "loss": 2.6535, + "mean_token_accuracy": 0.4730413556098938, + "num_tokens": 6902947612.0, + "step": 13503 + }, + { + "epoch": 3.6517036235803135, + "grad_norm": 2.15625, + "learning_rate": 0.00522101917663758, + "loss": 2.9128, + "mean_token_accuracy": 0.4363637864589691, + "num_tokens": 6903471895.0, + "step": 13504 + }, + { + "epoch": 3.651974040021633, + "grad_norm": 3.046875, + "learning_rate": 0.0052198107076018, + "loss": 2.7399, + "mean_token_accuracy": 0.4486587941646576, + "num_tokens": 6903960329.0, + "step": 13505 + }, + { + "epoch": 3.652244456462953, + "grad_norm": 2.546875, + "learning_rate": 0.0052186024159192245, + "loss": 2.7004, + "mean_token_accuracy": 0.4166715741157532, + "num_tokens": 6904484437.0, + "step": 13506 + }, + { + "epoch": 3.6525148729042725, + "grad_norm": 2.890625, + "learning_rate": 0.005217394301626928, + "loss": 3.0094, + "mean_token_accuracy": 0.4177866280078888, + "num_tokens": 6905008660.0, + "step": 13507 + }, + { + "epoch": 3.652785289345592, + "grad_norm": 3.234375, + "learning_rate": 0.005216186364761981, + "loss": 2.9371, + "mean_token_accuracy": 0.41738301515579224, + "num_tokens": 6905532781.0, + "step": 13508 + }, + { + "epoch": 3.6530557057869117, + "grad_norm": 2.953125, + "learning_rate": 0.005214978605361447, + "loss": 2.7151, + "mean_token_accuracy": 0.4558316767215729, + "num_tokens": 6906056958.0, + "step": 13509 + }, + { + "epoch": 3.6533261222282314, + "grad_norm": 2.765625, + "learning_rate": 0.005213771023462378, + "loss": 3.0007, + "mean_token_accuracy": 0.41542530059814453, + "num_tokens": 6906581080.0, + "step": 13510 + }, + { + "epoch": 3.653596538669551, + "grad_norm": 16.625, + "learning_rate": 0.005212563619101833, + "loss": 11.3088, + "mean_token_accuracy": 5.416187923401594e-05, + "num_tokens": 6907047759.0, + "step": 13511 + }, + { + "epoch": 3.6538669551108707, + "grad_norm": 6.53125, + "learning_rate": 0.005211356392316855, + "loss": 3.1474, + "mean_token_accuracy": 0.40268605947494507, + "num_tokens": 6907557123.0, + "step": 13512 + }, + { + "epoch": 3.6541373715521903, + "grad_norm": 2.34375, + "learning_rate": 0.005210149343144488, + "loss": 2.9654, + "mean_token_accuracy": 0.4196922183036804, + "num_tokens": 6908081323.0, + "step": 13513 + }, + { + "epoch": 3.65440778799351, + "grad_norm": 27.25, + "learning_rate": 0.005208942471621763, + "loss": 2.9965, + "mean_token_accuracy": 0.4136449992656708, + "num_tokens": 6908605558.0, + "step": 13514 + }, + { + "epoch": 3.6546782044348296, + "grad_norm": 6.3125, + "learning_rate": 0.005207735777785714, + "loss": 3.0663, + "mean_token_accuracy": 0.4032016098499298, + "num_tokens": 6909129806.0, + "step": 13515 + }, + { + "epoch": 3.6549486208761492, + "grad_norm": 2.5625, + "learning_rate": 0.005206529261673365, + "loss": 2.9245, + "mean_token_accuracy": 0.42635780572891235, + "num_tokens": 6909565446.0, + "step": 13516 + }, + { + "epoch": 3.655219037317469, + "grad_norm": 2.390625, + "learning_rate": 0.005205322923321738, + "loss": 2.856, + "mean_token_accuracy": 0.4286094307899475, + "num_tokens": 6910089725.0, + "step": 13517 + }, + { + "epoch": 3.6554894537587885, + "grad_norm": 4.28125, + "learning_rate": 0.005204116762767841, + "loss": 3.0312, + "mean_token_accuracy": 0.41000112891197205, + "num_tokens": 6910613989.0, + "step": 13518 + }, + { + "epoch": 3.655759870200108, + "grad_norm": 2.9375, + "learning_rate": 0.00520291078004869, + "loss": 2.8472, + "mean_token_accuracy": 0.4363616108894348, + "num_tokens": 6911138231.0, + "step": 13519 + }, + { + "epoch": 3.656030286641428, + "grad_norm": 3.8125, + "learning_rate": 0.005201704975201287, + "loss": 2.9608, + "mean_token_accuracy": 0.4154142737388611, + "num_tokens": 6911662398.0, + "step": 13520 + }, + { + "epoch": 3.6563007030827475, + "grad_norm": 2.5625, + "learning_rate": 0.005200499348262619, + "loss": 2.8641, + "mean_token_accuracy": 0.43017223477363586, + "num_tokens": 6912174917.0, + "step": 13521 + }, + { + "epoch": 3.656571119524067, + "grad_norm": 3.6875, + "learning_rate": 0.005199293899269693, + "loss": 2.6866, + "mean_token_accuracy": 0.43575865030288696, + "num_tokens": 6912699087.0, + "step": 13522 + }, + { + "epoch": 3.6568415359653867, + "grad_norm": 2.21875, + "learning_rate": 0.005198088628259488, + "loss": 2.9028, + "mean_token_accuracy": 0.4315958619117737, + "num_tokens": 6913223213.0, + "step": 13523 + }, + { + "epoch": 3.6571119524067064, + "grad_norm": 3.90625, + "learning_rate": 0.005196883535268984, + "loss": 2.984, + "mean_token_accuracy": 0.4165879487991333, + "num_tokens": 6913747488.0, + "step": 13524 + }, + { + "epoch": 3.657382368848026, + "grad_norm": 3.046875, + "learning_rate": 0.0051956786203351596, + "loss": 3.0071, + "mean_token_accuracy": 0.42069578170776367, + "num_tokens": 6914219848.0, + "step": 13525 + }, + { + "epoch": 3.6576527852893457, + "grad_norm": 2.96875, + "learning_rate": 0.005194473883494985, + "loss": 2.8055, + "mean_token_accuracy": 0.41714930534362793, + "num_tokens": 6914706347.0, + "step": 13526 + }, + { + "epoch": 3.6579232017306653, + "grad_norm": 2.859375, + "learning_rate": 0.005193269324785424, + "loss": 3.1093, + "mean_token_accuracy": 0.4255419969558716, + "num_tokens": 6915173253.0, + "step": 13527 + }, + { + "epoch": 3.658193618171985, + "grad_norm": 3.03125, + "learning_rate": 0.005192064944243433, + "loss": 3.0253, + "mean_token_accuracy": 0.40547725558280945, + "num_tokens": 6915697376.0, + "step": 13528 + }, + { + "epoch": 3.6584640346133046, + "grad_norm": 2.75, + "learning_rate": 0.005190860741905976, + "loss": 2.8992, + "mean_token_accuracy": 0.42228275537490845, + "num_tokens": 6916221513.0, + "step": 13529 + }, + { + "epoch": 3.6587344510546242, + "grad_norm": 2.953125, + "learning_rate": 0.005189656717809989, + "loss": 2.9124, + "mean_token_accuracy": 0.4182325601577759, + "num_tokens": 6916745783.0, + "step": 13530 + }, + { + "epoch": 3.659004867495944, + "grad_norm": 55.75, + "learning_rate": 0.0051884528719924276, + "loss": 15.0077, + "mean_token_accuracy": 0.023128895089030266, + "num_tokens": 6917269972.0, + "step": 13531 + }, + { + "epoch": 3.6592752839372635, + "grad_norm": 5.90625, + "learning_rate": 0.005187249204490219, + "loss": 2.9252, + "mean_token_accuracy": 0.4457995295524597, + "num_tokens": 6917736284.0, + "step": 13532 + }, + { + "epoch": 3.659545700378583, + "grad_norm": 3.234375, + "learning_rate": 0.0051860457153402985, + "loss": 2.8608, + "mean_token_accuracy": 0.44039756059646606, + "num_tokens": 6918260564.0, + "step": 13533 + }, + { + "epoch": 3.6598161168199024, + "grad_norm": 2.765625, + "learning_rate": 0.005184842404579595, + "loss": 2.7833, + "mean_token_accuracy": 0.443326473236084, + "num_tokens": 6918784823.0, + "step": 13534 + }, + { + "epoch": 3.6600865332612225, + "grad_norm": 3.21875, + "learning_rate": 0.0051836392722450254, + "loss": 2.8255, + "mean_token_accuracy": 0.4316258430480957, + "num_tokens": 6919309034.0, + "step": 13535 + }, + { + "epoch": 3.6603569497025417, + "grad_norm": 3.015625, + "learning_rate": 0.00518243631837351, + "loss": 2.9045, + "mean_token_accuracy": 0.42867207527160645, + "num_tokens": 6919833253.0, + "step": 13536 + }, + { + "epoch": 3.6606273661438617, + "grad_norm": 4.03125, + "learning_rate": 0.005181233543001957, + "loss": 2.9137, + "mean_token_accuracy": 0.4284645617008209, + "num_tokens": 6920357420.0, + "step": 13537 + }, + { + "epoch": 3.660897782585181, + "grad_norm": 2.890625, + "learning_rate": 0.005180030946167269, + "loss": 2.9448, + "mean_token_accuracy": 0.42072921991348267, + "num_tokens": 6920881668.0, + "step": 13538 + }, + { + "epoch": 3.661168199026501, + "grad_norm": 3.546875, + "learning_rate": 0.005178828527906347, + "loss": 3.0153, + "mean_token_accuracy": 0.44719356298446655, + "num_tokens": 6921294287.0, + "step": 13539 + }, + { + "epoch": 3.6614386154678202, + "grad_norm": 3.15625, + "learning_rate": 0.005177626288256088, + "loss": 3.0398, + "mean_token_accuracy": 0.42669975757598877, + "num_tokens": 6921772631.0, + "step": 13540 + }, + { + "epoch": 3.6617090319091403, + "grad_norm": 2.75, + "learning_rate": 0.005176424227253374, + "loss": 2.6825, + "mean_token_accuracy": 0.45291218161582947, + "num_tokens": 6922296904.0, + "step": 13541 + }, + { + "epoch": 3.6619794483504595, + "grad_norm": 2.765625, + "learning_rate": 0.005175222344935091, + "loss": 3.044, + "mean_token_accuracy": 0.4239528179168701, + "num_tokens": 6922784695.0, + "step": 13542 + }, + { + "epoch": 3.6622498647917796, + "grad_norm": 3.359375, + "learning_rate": 0.005174020641338114, + "loss": 2.8472, + "mean_token_accuracy": 0.43150871992111206, + "num_tokens": 6923308896.0, + "step": 13543 + }, + { + "epoch": 3.662520281233099, + "grad_norm": 2.765625, + "learning_rate": 0.005172819116499318, + "loss": 2.9588, + "mean_token_accuracy": 0.4113757014274597, + "num_tokens": 6923830183.0, + "step": 13544 + }, + { + "epoch": 3.6627906976744184, + "grad_norm": 3.015625, + "learning_rate": 0.00517161777045557, + "loss": 2.8082, + "mean_token_accuracy": 0.4312198758125305, + "num_tokens": 6924354442.0, + "step": 13545 + }, + { + "epoch": 3.663061114115738, + "grad_norm": 3.109375, + "learning_rate": 0.005170416603243728, + "loss": 2.7042, + "mean_token_accuracy": 0.45361560583114624, + "num_tokens": 6924878692.0, + "step": 13546 + }, + { + "epoch": 3.6633315305570577, + "grad_norm": 3.109375, + "learning_rate": 0.005169215614900647, + "loss": 2.6206, + "mean_token_accuracy": 0.4788118898868561, + "num_tokens": 6925402822.0, + "step": 13547 + }, + { + "epoch": 3.6636019469983774, + "grad_norm": 3.28125, + "learning_rate": 0.005168014805463182, + "loss": 2.9166, + "mean_token_accuracy": 0.4296373128890991, + "num_tokens": 6925870187.0, + "step": 13548 + }, + { + "epoch": 3.663872363439697, + "grad_norm": 3.75, + "learning_rate": 0.005166814174968171, + "loss": 3.0375, + "mean_token_accuracy": 0.4303898215293884, + "num_tokens": 6926370420.0, + "step": 13549 + }, + { + "epoch": 3.6641427798810167, + "grad_norm": 3.15625, + "learning_rate": 0.005165613723452454, + "loss": 2.9748, + "mean_token_accuracy": 0.43984246253967285, + "num_tokens": 6926827856.0, + "step": 13550 + }, + { + "epoch": 3.6644131963223363, + "grad_norm": 20.75, + "learning_rate": 0.005164413450952869, + "loss": 11.4541, + "mean_token_accuracy": 8.13096557976678e-05, + "num_tokens": 6927352089.0, + "step": 13551 + }, + { + "epoch": 3.664683612763656, + "grad_norm": 23.625, + "learning_rate": 0.00516321335750624, + "loss": 2.9962, + "mean_token_accuracy": 0.4256803095340729, + "num_tokens": 6927876062.0, + "step": 13552 + }, + { + "epoch": 3.6649540292049756, + "grad_norm": 4.8125, + "learning_rate": 0.005162013443149387, + "loss": 3.1315, + "mean_token_accuracy": 0.40182435512542725, + "num_tokens": 6928400228.0, + "step": 13553 + }, + { + "epoch": 3.6652244456462952, + "grad_norm": 3.0625, + "learning_rate": 0.005160813707919132, + "loss": 2.826, + "mean_token_accuracy": 0.412689745426178, + "num_tokens": 6928883064.0, + "step": 13554 + }, + { + "epoch": 3.665494862087615, + "grad_norm": 2.8125, + "learning_rate": 0.005159614151852283, + "loss": 2.6815, + "mean_token_accuracy": 0.47147563099861145, + "num_tokens": 6929407219.0, + "step": 13555 + }, + { + "epoch": 3.6657652785289345, + "grad_norm": 2.734375, + "learning_rate": 0.00515841477498565, + "loss": 2.9024, + "mean_token_accuracy": 0.42741861939430237, + "num_tokens": 6929931475.0, + "step": 13556 + }, + { + "epoch": 3.666035694970254, + "grad_norm": 3.46875, + "learning_rate": 0.005157215577356027, + "loss": 3.0123, + "mean_token_accuracy": 0.425930917263031, + "num_tokens": 6930406282.0, + "step": 13557 + }, + { + "epoch": 3.666306111411574, + "grad_norm": 2.921875, + "learning_rate": 0.0051560165590002136, + "loss": 2.674, + "mean_token_accuracy": 0.4521692395210266, + "num_tokens": 6930889737.0, + "step": 13558 + }, + { + "epoch": 3.6665765278528935, + "grad_norm": 3.90625, + "learning_rate": 0.005154817719955001, + "loss": 2.652, + "mean_token_accuracy": 0.44278469681739807, + "num_tokens": 6931376533.0, + "step": 13559 + }, + { + "epoch": 3.666846944294213, + "grad_norm": 3.0, + "learning_rate": 0.005153619060257169, + "loss": 3.0898, + "mean_token_accuracy": 0.4049878716468811, + "num_tokens": 6931900579.0, + "step": 13560 + }, + { + "epoch": 3.6671173607355327, + "grad_norm": 3.421875, + "learning_rate": 0.005152420579943496, + "loss": 3.0367, + "mean_token_accuracy": 0.40415865182876587, + "num_tokens": 6932424787.0, + "step": 13561 + }, + { + "epoch": 3.6673877771768524, + "grad_norm": 3.390625, + "learning_rate": 0.005151222279050758, + "loss": 2.8696, + "mean_token_accuracy": 0.4146678149700165, + "num_tokens": 6932915820.0, + "step": 13562 + }, + { + "epoch": 3.667658193618172, + "grad_norm": 2.796875, + "learning_rate": 0.00515002415761572, + "loss": 2.9128, + "mean_token_accuracy": 0.4303084909915924, + "num_tokens": 6933440061.0, + "step": 13563 + }, + { + "epoch": 3.6679286100594917, + "grad_norm": 2.453125, + "learning_rate": 0.005148826215675144, + "loss": 2.8189, + "mean_token_accuracy": 0.403961181640625, + "num_tokens": 6933964135.0, + "step": 13564 + }, + { + "epoch": 3.6681990265008113, + "grad_norm": 2.34375, + "learning_rate": 0.005147628453265788, + "loss": 3.0295, + "mean_token_accuracy": 0.42865052819252014, + "num_tokens": 6934432540.0, + "step": 13565 + }, + { + "epoch": 3.668469442942131, + "grad_norm": 4.40625, + "learning_rate": 0.005146430870424402, + "loss": 2.6495, + "mean_token_accuracy": 0.44190752506256104, + "num_tokens": 6934956617.0, + "step": 13566 + }, + { + "epoch": 3.6687398593834506, + "grad_norm": 2.34375, + "learning_rate": 0.0051452334671877285, + "loss": 3.0338, + "mean_token_accuracy": 0.42821332812309265, + "num_tokens": 6935480767.0, + "step": 13567 + }, + { + "epoch": 3.6690102758247702, + "grad_norm": 7.4375, + "learning_rate": 0.005144036243592514, + "loss": 2.8054, + "mean_token_accuracy": 0.4271760582923889, + "num_tokens": 6936005033.0, + "step": 13568 + }, + { + "epoch": 3.66928069226609, + "grad_norm": 2.28125, + "learning_rate": 0.005142839199675487, + "loss": 2.7264, + "mean_token_accuracy": 0.41604000329971313, + "num_tokens": 6936529283.0, + "step": 13569 + }, + { + "epoch": 3.6695511087074095, + "grad_norm": 2.796875, + "learning_rate": 0.005141642335473376, + "loss": 3.0835, + "mean_token_accuracy": 0.41307827830314636, + "num_tokens": 6937051515.0, + "step": 13570 + }, + { + "epoch": 3.669821525148729, + "grad_norm": 1.3671875, + "learning_rate": 0.005140445651022906, + "loss": 10.9266, + "mean_token_accuracy": 2.5927252863766626e-05, + "num_tokens": 6937575798.0, + "step": 13571 + }, + { + "epoch": 3.670091941590049, + "grad_norm": 10.375, + "learning_rate": 0.005139249146360798, + "loss": 2.9774, + "mean_token_accuracy": 0.4285186231136322, + "num_tokens": 6938099995.0, + "step": 13572 + }, + { + "epoch": 3.6703623580313685, + "grad_norm": 2.71875, + "learning_rate": 0.00513805282152376, + "loss": 3.0339, + "mean_token_accuracy": 0.4187260866165161, + "num_tokens": 6938624267.0, + "step": 13573 + }, + { + "epoch": 3.670632774472688, + "grad_norm": 3.046875, + "learning_rate": 0.0051368566765485025, + "loss": 2.899, + "mean_token_accuracy": 0.4266993999481201, + "num_tokens": 6939148495.0, + "step": 13574 + }, + { + "epoch": 3.6709031909140073, + "grad_norm": 2.375, + "learning_rate": 0.005135660711471724, + "loss": 2.9523, + "mean_token_accuracy": 0.42678776383399963, + "num_tokens": 6939672680.0, + "step": 13575 + }, + { + "epoch": 3.6711736073553274, + "grad_norm": 3.125, + "learning_rate": 0.005134464926330118, + "loss": 2.7973, + "mean_token_accuracy": 0.41598284244537354, + "num_tokens": 6940196894.0, + "step": 13576 + }, + { + "epoch": 3.6714440237966466, + "grad_norm": 3.3125, + "learning_rate": 0.00513326932116038, + "loss": 2.8442, + "mean_token_accuracy": 0.4230261445045471, + "num_tokens": 6940721167.0, + "step": 13577 + }, + { + "epoch": 3.6717144402379667, + "grad_norm": 3.5625, + "learning_rate": 0.005132073895999193, + "loss": 2.9299, + "mean_token_accuracy": 0.415766179561615, + "num_tokens": 6941245434.0, + "step": 13578 + }, + { + "epoch": 3.671984856679286, + "grad_norm": 10.6875, + "learning_rate": 0.005130878650883233, + "loss": 2.6773, + "mean_token_accuracy": 0.4673764705657959, + "num_tokens": 6941769583.0, + "step": 13579 + }, + { + "epoch": 3.672255273120606, + "grad_norm": 3.0625, + "learning_rate": 0.005129683585849178, + "loss": 2.8834, + "mean_token_accuracy": 0.44319450855255127, + "num_tokens": 6942293760.0, + "step": 13580 + }, + { + "epoch": 3.672525689561925, + "grad_norm": 2.875, + "learning_rate": 0.005128488700933691, + "loss": 2.9858, + "mean_token_accuracy": 0.4230215847492218, + "num_tokens": 6942817951.0, + "step": 13581 + }, + { + "epoch": 3.6727961060032452, + "grad_norm": 2.984375, + "learning_rate": 0.005127293996173441, + "loss": 2.7599, + "mean_token_accuracy": 0.4482845962047577, + "num_tokens": 6943342003.0, + "step": 13582 + }, + { + "epoch": 3.6730665224445644, + "grad_norm": 2.75, + "learning_rate": 0.005126099471605082, + "loss": 2.954, + "mean_token_accuracy": 0.43224048614501953, + "num_tokens": 6943857640.0, + "step": 13583 + }, + { + "epoch": 3.6733369388858845, + "grad_norm": 3.84375, + "learning_rate": 0.00512490512726526, + "loss": 3.0521, + "mean_token_accuracy": 0.42810583114624023, + "num_tokens": 6944357870.0, + "step": 13584 + }, + { + "epoch": 3.6736073553272037, + "grad_norm": 3.375, + "learning_rate": 0.005123710963190631, + "loss": 2.8559, + "mean_token_accuracy": 0.4234168529510498, + "num_tokens": 6944882147.0, + "step": 13585 + }, + { + "epoch": 3.6738777717685234, + "grad_norm": 2.96875, + "learning_rate": 0.005122516979417829, + "loss": 2.8658, + "mean_token_accuracy": 0.433329701423645, + "num_tokens": 6945387669.0, + "step": 13586 + }, + { + "epoch": 3.674148188209843, + "grad_norm": 4.0625, + "learning_rate": 0.005121323175983488, + "loss": 3.0647, + "mean_token_accuracy": 0.4010298252105713, + "num_tokens": 6945911828.0, + "step": 13587 + }, + { + "epoch": 3.6744186046511627, + "grad_norm": 3.125, + "learning_rate": 0.005120129552924244, + "loss": 2.9355, + "mean_token_accuracy": 0.42055651545524597, + "num_tokens": 6946436073.0, + "step": 13588 + }, + { + "epoch": 3.6746890210924823, + "grad_norm": 3.671875, + "learning_rate": 0.005118936110276718, + "loss": 3.0141, + "mean_token_accuracy": 0.41717860102653503, + "num_tokens": 6946960354.0, + "step": 13589 + }, + { + "epoch": 3.674959437533802, + "grad_norm": 3.296875, + "learning_rate": 0.005117742848077522, + "loss": 2.8107, + "mean_token_accuracy": 0.4362625479698181, + "num_tokens": 6947484548.0, + "step": 13590 + }, + { + "epoch": 3.6752298539751216, + "grad_norm": 7.53125, + "learning_rate": 0.005116549766363279, + "loss": 9.8583, + "mean_token_accuracy": 0.006776802241802216, + "num_tokens": 6947937827.0, + "step": 13591 + }, + { + "epoch": 3.6755002704164412, + "grad_norm": 7.59375, + "learning_rate": 0.00511535686517059, + "loss": 3.1496, + "mean_token_accuracy": 0.4035971164703369, + "num_tokens": 6948462079.0, + "step": 13592 + }, + { + "epoch": 3.675770686857761, + "grad_norm": 3.265625, + "learning_rate": 0.005114164144536055, + "loss": 2.9634, + "mean_token_accuracy": 0.43671172857284546, + "num_tokens": 6948952222.0, + "step": 13593 + }, + { + "epoch": 3.6760411032990805, + "grad_norm": 3.1875, + "learning_rate": 0.0051129716044962786, + "loss": 3.0198, + "mean_token_accuracy": 0.41111841797828674, + "num_tokens": 6949476499.0, + "step": 13594 + }, + { + "epoch": 3.6763115197404, + "grad_norm": 3.09375, + "learning_rate": 0.0051117792450878444, + "loss": 2.7608, + "mean_token_accuracy": 0.4300123453140259, + "num_tokens": 6950000745.0, + "step": 13595 + }, + { + "epoch": 3.67658193618172, + "grad_norm": 3.0, + "learning_rate": 0.0051105870663473345, + "loss": 3.1173, + "mean_token_accuracy": 0.42567092180252075, + "num_tokens": 6950468787.0, + "step": 13596 + }, + { + "epoch": 3.6768523526230394, + "grad_norm": 3.40625, + "learning_rate": 0.005109395068311338, + "loss": 2.968, + "mean_token_accuracy": 0.42272546887397766, + "num_tokens": 6950896860.0, + "step": 13597 + }, + { + "epoch": 3.677122769064359, + "grad_norm": 3.71875, + "learning_rate": 0.00510820325101642, + "loss": 2.8657, + "mean_token_accuracy": 0.4247409701347351, + "num_tokens": 6951384517.0, + "step": 13598 + }, + { + "epoch": 3.6773931855056787, + "grad_norm": 2.796875, + "learning_rate": 0.005107011614499157, + "loss": 2.8118, + "mean_token_accuracy": 0.4410974383354187, + "num_tokens": 6951883597.0, + "step": 13599 + }, + { + "epoch": 3.6776636019469984, + "grad_norm": 3.453125, + "learning_rate": 0.005105820158796103, + "loss": 2.9152, + "mean_token_accuracy": 0.4242144823074341, + "num_tokens": 6952407861.0, + "step": 13600 + }, + { + "epoch": 3.677934018388318, + "grad_norm": 3.15625, + "learning_rate": 0.005104628883943825, + "loss": 2.8648, + "mean_token_accuracy": 0.427971750497818, + "num_tokens": 6952932008.0, + "step": 13601 + }, + { + "epoch": 3.6782044348296377, + "grad_norm": 3.515625, + "learning_rate": 0.005103437789978867, + "loss": 2.7238, + "mean_token_accuracy": 0.4385855197906494, + "num_tokens": 6953456291.0, + "step": 13602 + }, + { + "epoch": 3.6784748512709573, + "grad_norm": 2.828125, + "learning_rate": 0.00510224687693778, + "loss": 2.8891, + "mean_token_accuracy": 0.43238365650177, + "num_tokens": 6953980570.0, + "step": 13603 + }, + { + "epoch": 3.678745267712277, + "grad_norm": 3.9375, + "learning_rate": 0.005101056144857099, + "loss": 2.9704, + "mean_token_accuracy": 0.4257287383079529, + "num_tokens": 6954504710.0, + "step": 13604 + }, + { + "epoch": 3.6790156841535966, + "grad_norm": 2.953125, + "learning_rate": 0.005099865593773368, + "loss": 2.9576, + "mean_token_accuracy": 0.4149984121322632, + "num_tokens": 6955028786.0, + "step": 13605 + }, + { + "epoch": 3.6792861005949162, + "grad_norm": 2.5, + "learning_rate": 0.005098675223723112, + "loss": 2.9209, + "mean_token_accuracy": 0.4197039008140564, + "num_tokens": 6955552946.0, + "step": 13606 + }, + { + "epoch": 3.679556517036236, + "grad_norm": 2.75, + "learning_rate": 0.005097485034742851, + "loss": 2.8311, + "mean_token_accuracy": 0.4249647855758667, + "num_tokens": 6956077102.0, + "step": 13607 + }, + { + "epoch": 3.6798269334775555, + "grad_norm": 5.21875, + "learning_rate": 0.00509629502686911, + "loss": 2.7862, + "mean_token_accuracy": 0.4480458199977875, + "num_tokens": 6956601184.0, + "step": 13608 + }, + { + "epoch": 3.680097349918875, + "grad_norm": 1.84375, + "learning_rate": 0.005095105200138401, + "loss": 2.9477, + "mean_token_accuracy": 0.4446195065975189, + "num_tokens": 6957062790.0, + "step": 13609 + }, + { + "epoch": 3.680367766360195, + "grad_norm": 2.875, + "learning_rate": 0.005093915554587225, + "loss": 2.7677, + "mean_token_accuracy": 0.4445416033267975, + "num_tokens": 6957524102.0, + "step": 13610 + }, + { + "epoch": 3.6806381828015144, + "grad_norm": 28.5, + "learning_rate": 0.005092726090252091, + "loss": 12.094, + "mean_token_accuracy": 0.04386197775602341, + "num_tokens": 6958048368.0, + "step": 13611 + }, + { + "epoch": 3.680908599242834, + "grad_norm": 6.34375, + "learning_rate": 0.005091536807169494, + "loss": 2.9464, + "mean_token_accuracy": 0.4127577245235443, + "num_tokens": 6958572554.0, + "step": 13612 + }, + { + "epoch": 3.6811790156841537, + "grad_norm": 2.265625, + "learning_rate": 0.005090347705375919, + "loss": 2.9898, + "mean_token_accuracy": 0.4265214800834656, + "num_tokens": 6959096741.0, + "step": 13613 + }, + { + "epoch": 3.6814494321254734, + "grad_norm": 2.9375, + "learning_rate": 0.0050891587849078565, + "loss": 2.8865, + "mean_token_accuracy": 0.43833938241004944, + "num_tokens": 6959540300.0, + "step": 13614 + }, + { + "epoch": 3.681719848566793, + "grad_norm": 3.421875, + "learning_rate": 0.005087970045801788, + "loss": 2.9299, + "mean_token_accuracy": 0.4295980632305145, + "num_tokens": 6959995953.0, + "step": 13615 + }, + { + "epoch": 3.681990265008112, + "grad_norm": 2.75, + "learning_rate": 0.005086781488094183, + "loss": 3.0482, + "mean_token_accuracy": 0.4082890450954437, + "num_tokens": 6960520137.0, + "step": 13616 + }, + { + "epoch": 3.6822606814494323, + "grad_norm": 2.890625, + "learning_rate": 0.0050855931118215115, + "loss": 2.7401, + "mean_token_accuracy": 0.4559016525745392, + "num_tokens": 6960980804.0, + "step": 13617 + }, + { + "epoch": 3.6825310978907515, + "grad_norm": 3.046875, + "learning_rate": 0.005084404917020239, + "loss": 3.0469, + "mean_token_accuracy": 0.42712166905403137, + "num_tokens": 6961451433.0, + "step": 13618 + }, + { + "epoch": 3.6828015143320716, + "grad_norm": 3.9375, + "learning_rate": 0.005083216903726817, + "loss": 3.0796, + "mean_token_accuracy": 0.41350769996643066, + "num_tokens": 6961975653.0, + "step": 13619 + }, + { + "epoch": 3.683071930773391, + "grad_norm": 2.890625, + "learning_rate": 0.0050820290719777025, + "loss": 2.77, + "mean_token_accuracy": 0.4256809651851654, + "num_tokens": 6962499907.0, + "step": 13620 + }, + { + "epoch": 3.683342347214711, + "grad_norm": 2.53125, + "learning_rate": 0.005080841421809341, + "loss": 2.8148, + "mean_token_accuracy": 0.4480201005935669, + "num_tokens": 6962967614.0, + "step": 13621 + }, + { + "epoch": 3.68361276365603, + "grad_norm": 3.5625, + "learning_rate": 0.005079653953258168, + "loss": 3.1461, + "mean_token_accuracy": 0.4019780158996582, + "num_tokens": 6963491883.0, + "step": 13622 + }, + { + "epoch": 3.68388318009735, + "grad_norm": 2.6875, + "learning_rate": 0.005078466666360624, + "loss": 2.4766, + "mean_token_accuracy": 0.49836111068725586, + "num_tokens": 6964015973.0, + "step": 13623 + }, + { + "epoch": 3.6841535965386694, + "grad_norm": 3.25, + "learning_rate": 0.00507727956115314, + "loss": 3.0555, + "mean_token_accuracy": 0.4031786024570465, + "num_tokens": 6964540179.0, + "step": 13624 + }, + { + "epoch": 3.6844240129799894, + "grad_norm": 3.21875, + "learning_rate": 0.005076092637672132, + "loss": 2.9649, + "mean_token_accuracy": 0.4215172827243805, + "num_tokens": 6965064402.0, + "step": 13625 + }, + { + "epoch": 3.6846944294213086, + "grad_norm": 2.96875, + "learning_rate": 0.005074905895954025, + "loss": 2.9322, + "mean_token_accuracy": 0.4063242971897125, + "num_tokens": 6965588590.0, + "step": 13626 + }, + { + "epoch": 3.6849648458626283, + "grad_norm": 2.609375, + "learning_rate": 0.005073719336035227, + "loss": 2.9203, + "mean_token_accuracy": 0.41813039779663086, + "num_tokens": 6966112835.0, + "step": 13627 + }, + { + "epoch": 3.685235262303948, + "grad_norm": 2.84375, + "learning_rate": 0.005072532957952148, + "loss": 3.0275, + "mean_token_accuracy": 0.42727863788604736, + "num_tokens": 6966637011.0, + "step": 13628 + }, + { + "epoch": 3.6855056787452676, + "grad_norm": 2.90625, + "learning_rate": 0.005071346761741194, + "loss": 2.8837, + "mean_token_accuracy": 0.4070788025856018, + "num_tokens": 6967161231.0, + "step": 13629 + }, + { + "epoch": 3.685776095186587, + "grad_norm": 2.765625, + "learning_rate": 0.005070160747438752, + "loss": 2.9364, + "mean_token_accuracy": 0.42562711238861084, + "num_tokens": 6967685458.0, + "step": 13630 + }, + { + "epoch": 3.686046511627907, + "grad_norm": 23.0, + "learning_rate": 0.0050689749150812195, + "loss": 12.4922, + "mean_token_accuracy": 0.0, + "num_tokens": 6968188760.0, + "step": 13631 + }, + { + "epoch": 3.6863169280692265, + "grad_norm": 7.34375, + "learning_rate": 0.005067789264704978, + "loss": 2.908, + "mean_token_accuracy": 0.39209532737731934, + "num_tokens": 6968713019.0, + "step": 13632 + }, + { + "epoch": 3.686587344510546, + "grad_norm": 2.84375, + "learning_rate": 0.005066603796346405, + "loss": 2.9771, + "mean_token_accuracy": 0.4203602075576782, + "num_tokens": 6969237182.0, + "step": 13633 + }, + { + "epoch": 3.686857760951866, + "grad_norm": 3.5, + "learning_rate": 0.0050654185100418815, + "loss": 2.9584, + "mean_token_accuracy": 0.4314517378807068, + "num_tokens": 6969678389.0, + "step": 13634 + }, + { + "epoch": 3.6871281773931854, + "grad_norm": 3.53125, + "learning_rate": 0.005064233405827768, + "loss": 2.8271, + "mean_token_accuracy": 0.44685447216033936, + "num_tokens": 6970189536.0, + "step": 13635 + }, + { + "epoch": 3.687398593834505, + "grad_norm": 3.03125, + "learning_rate": 0.005063048483740427, + "loss": 2.8828, + "mean_token_accuracy": 0.4388502836227417, + "num_tokens": 6970713757.0, + "step": 13636 + }, + { + "epoch": 3.6876690102758247, + "grad_norm": 2.75, + "learning_rate": 0.005061863743816222, + "loss": 2.8605, + "mean_token_accuracy": 0.4254177212715149, + "num_tokens": 6971237933.0, + "step": 13637 + }, + { + "epoch": 3.6879394267171444, + "grad_norm": 3.09375, + "learning_rate": 0.005060679186091499, + "loss": 2.7989, + "mean_token_accuracy": 0.422808974981308, + "num_tokens": 6971762202.0, + "step": 13638 + }, + { + "epoch": 3.688209843158464, + "grad_norm": 2.875, + "learning_rate": 0.005059494810602603, + "loss": 2.7793, + "mean_token_accuracy": 0.4249400496482849, + "num_tokens": 6972256559.0, + "step": 13639 + }, + { + "epoch": 3.6884802595997837, + "grad_norm": 2.890625, + "learning_rate": 0.005058310617385878, + "loss": 2.8794, + "mean_token_accuracy": 0.41989362239837646, + "num_tokens": 6972780665.0, + "step": 13640 + }, + { + "epoch": 3.6887506760411033, + "grad_norm": 3.078125, + "learning_rate": 0.005057126606477656, + "loss": 2.9499, + "mean_token_accuracy": 0.41708025336265564, + "num_tokens": 6973304950.0, + "step": 13641 + }, + { + "epoch": 3.689021092482423, + "grad_norm": 4.4375, + "learning_rate": 0.005055942777914267, + "loss": 3.118, + "mean_token_accuracy": 0.3936951458454132, + "num_tokens": 6973829169.0, + "step": 13642 + }, + { + "epoch": 3.6892915089237426, + "grad_norm": 3.359375, + "learning_rate": 0.005054759131732032, + "loss": 2.8348, + "mean_token_accuracy": 0.4264504909515381, + "num_tokens": 6974325252.0, + "step": 13643 + }, + { + "epoch": 3.6895619253650622, + "grad_norm": 3.265625, + "learning_rate": 0.005053575667967274, + "loss": 2.7637, + "mean_token_accuracy": 0.43382591009140015, + "num_tokens": 6974849529.0, + "step": 13644 + }, + { + "epoch": 3.689832341806382, + "grad_norm": 3.484375, + "learning_rate": 0.005052392386656297, + "loss": 2.7301, + "mean_token_accuracy": 0.44322723150253296, + "num_tokens": 6975373712.0, + "step": 13645 + }, + { + "epoch": 3.6901027582477015, + "grad_norm": 3.078125, + "learning_rate": 0.005051209287835416, + "loss": 2.9237, + "mean_token_accuracy": 0.4324544072151184, + "num_tokens": 6975882400.0, + "step": 13646 + }, + { + "epoch": 3.690373174689021, + "grad_norm": 3.46875, + "learning_rate": 0.00505002637154093, + "loss": 2.8985, + "mean_token_accuracy": 0.44045692682266235, + "num_tokens": 6976345687.0, + "step": 13647 + }, + { + "epoch": 3.690643591130341, + "grad_norm": 3.71875, + "learning_rate": 0.005048843637809128, + "loss": 2.8574, + "mean_token_accuracy": 0.4377421736717224, + "num_tokens": 6976869909.0, + "step": 13648 + }, + { + "epoch": 3.6909140075716604, + "grad_norm": 2.890625, + "learning_rate": 0.0050476610866763095, + "loss": 2.9622, + "mean_token_accuracy": 0.41805028915405273, + "num_tokens": 6977360009.0, + "step": 13649 + }, + { + "epoch": 3.69118442401298, + "grad_norm": 3.0625, + "learning_rate": 0.005046478718178748, + "loss": 2.758, + "mean_token_accuracy": 0.44326403737068176, + "num_tokens": 6977884128.0, + "step": 13650 + }, + { + "epoch": 3.6914548404542997, + "grad_norm": 14.5, + "learning_rate": 0.005045296532352731, + "loss": 10.5073, + "mean_token_accuracy": 0.011408871039748192, + "num_tokens": 6978408287.0, + "step": 13651 + }, + { + "epoch": 3.6917252568956194, + "grad_norm": 8.3125, + "learning_rate": 0.005044114529234529, + "loss": 3.0559, + "mean_token_accuracy": 0.40134379267692566, + "num_tokens": 6978932420.0, + "step": 13652 + }, + { + "epoch": 3.691995673336939, + "grad_norm": 2.046875, + "learning_rate": 0.005042932708860405, + "loss": 2.8737, + "mean_token_accuracy": 0.43948638439178467, + "num_tokens": 6979450386.0, + "step": 13653 + }, + { + "epoch": 3.6922660897782587, + "grad_norm": 2.734375, + "learning_rate": 0.005041751071266627, + "loss": 2.8331, + "mean_token_accuracy": 0.44799140095710754, + "num_tokens": 6979974603.0, + "step": 13654 + }, + { + "epoch": 3.6925365062195783, + "grad_norm": 3.171875, + "learning_rate": 0.00504056961648945, + "loss": 2.9516, + "mean_token_accuracy": 0.4328371286392212, + "num_tokens": 6980468823.0, + "step": 13655 + }, + { + "epoch": 3.692806922660898, + "grad_norm": 3.59375, + "learning_rate": 0.005039388344565117, + "loss": 2.6827, + "mean_token_accuracy": 0.42031246423721313, + "num_tokens": 6980985150.0, + "step": 13656 + }, + { + "epoch": 3.693077339102217, + "grad_norm": 2.90625, + "learning_rate": 0.0050382072555298805, + "loss": 2.9282, + "mean_token_accuracy": 0.4256494343280792, + "num_tokens": 6981463766.0, + "step": 13657 + }, + { + "epoch": 3.6933477555435372, + "grad_norm": 3.421875, + "learning_rate": 0.00503702634941998, + "loss": 2.9462, + "mean_token_accuracy": 0.4277678430080414, + "num_tokens": 6981988039.0, + "step": 13658 + }, + { + "epoch": 3.6936181719848564, + "grad_norm": 2.796875, + "learning_rate": 0.005035845626271644, + "loss": 2.7788, + "mean_token_accuracy": 0.43596094846725464, + "num_tokens": 6982512170.0, + "step": 13659 + }, + { + "epoch": 3.6938885884261765, + "grad_norm": 2.9375, + "learning_rate": 0.005034665086121107, + "loss": 2.9138, + "mean_token_accuracy": 0.42894378304481506, + "num_tokens": 6983036318.0, + "step": 13660 + }, + { + "epoch": 3.6941590048674957, + "grad_norm": 3.25, + "learning_rate": 0.005033484729004587, + "loss": 3.035, + "mean_token_accuracy": 0.4168640971183777, + "num_tokens": 6983560514.0, + "step": 13661 + }, + { + "epoch": 3.694429421308816, + "grad_norm": 2.921875, + "learning_rate": 0.005032304554958299, + "loss": 2.9665, + "mean_token_accuracy": 0.43031150102615356, + "num_tokens": 6984049426.0, + "step": 13662 + }, + { + "epoch": 3.694699837750135, + "grad_norm": 3.15625, + "learning_rate": 0.005031124564018461, + "loss": 2.9029, + "mean_token_accuracy": 0.43615856766700745, + "num_tokens": 6984573597.0, + "step": 13663 + }, + { + "epoch": 3.694970254191455, + "grad_norm": 3.234375, + "learning_rate": 0.005029944756221274, + "loss": 2.7755, + "mean_token_accuracy": 0.43996840715408325, + "num_tokens": 6985097773.0, + "step": 13664 + }, + { + "epoch": 3.6952406706327743, + "grad_norm": 2.765625, + "learning_rate": 0.005028765131602937, + "loss": 2.9724, + "mean_token_accuracy": 0.4276471734046936, + "num_tokens": 6985592019.0, + "step": 13665 + }, + { + "epoch": 3.6955110870740944, + "grad_norm": 3.1875, + "learning_rate": 0.005027585690199648, + "loss": 3.0356, + "mean_token_accuracy": 0.4371746778488159, + "num_tokens": 6986082243.0, + "step": 13666 + }, + { + "epoch": 3.6957815035154136, + "grad_norm": 3.28125, + "learning_rate": 0.005026406432047594, + "loss": 2.9623, + "mean_token_accuracy": 0.4279385209083557, + "num_tokens": 6986596642.0, + "step": 13667 + }, + { + "epoch": 3.696051919956733, + "grad_norm": 2.671875, + "learning_rate": 0.005025227357182955, + "loss": 2.8047, + "mean_token_accuracy": 0.4483209252357483, + "num_tokens": 6987103982.0, + "step": 13668 + }, + { + "epoch": 3.696322336398053, + "grad_norm": 3.421875, + "learning_rate": 0.005024048465641914, + "loss": 2.8003, + "mean_token_accuracy": 0.4158324599266052, + "num_tokens": 6987628128.0, + "step": 13669 + }, + { + "epoch": 3.6965927528393725, + "grad_norm": 2.515625, + "learning_rate": 0.005022869757460638, + "loss": 2.7794, + "mean_token_accuracy": 0.44449755549430847, + "num_tokens": 6988152272.0, + "step": 13670 + }, + { + "epoch": 3.696863169280692, + "grad_norm": 4.71875, + "learning_rate": 0.005021691232675294, + "loss": 9.7933, + "mean_token_accuracy": 0.030034195631742477, + "num_tokens": 6988618410.0, + "step": 13671 + }, + { + "epoch": 3.697133585722012, + "grad_norm": 7.25, + "learning_rate": 0.005020512891322047, + "loss": 2.9334, + "mean_token_accuracy": 0.4034341275691986, + "num_tokens": 6989142661.0, + "step": 13672 + }, + { + "epoch": 3.6974040021633314, + "grad_norm": 2.0625, + "learning_rate": 0.005019334733437047, + "loss": 2.7325, + "mean_token_accuracy": 0.4375183582305908, + "num_tokens": 6989666748.0, + "step": 13673 + }, + { + "epoch": 3.697674418604651, + "grad_norm": 2.953125, + "learning_rate": 0.005018156759056448, + "loss": 2.9763, + "mean_token_accuracy": 0.41696879267692566, + "num_tokens": 6990190845.0, + "step": 13674 + }, + { + "epoch": 3.6979448350459707, + "grad_norm": 2.75, + "learning_rate": 0.005016978968216392, + "loss": 2.9774, + "mean_token_accuracy": 0.4034090042114258, + "num_tokens": 6990715126.0, + "step": 13675 + }, + { + "epoch": 3.6982152514872904, + "grad_norm": 3.484375, + "learning_rate": 0.005015801360953012, + "loss": 2.804, + "mean_token_accuracy": 0.4392450451850891, + "num_tokens": 6991239405.0, + "step": 13676 + }, + { + "epoch": 3.69848566792861, + "grad_norm": 3.53125, + "learning_rate": 0.005014623937302447, + "loss": 3.0023, + "mean_token_accuracy": 0.4001023471355438, + "num_tokens": 6991763634.0, + "step": 13677 + }, + { + "epoch": 3.6987560843699296, + "grad_norm": 3.890625, + "learning_rate": 0.005013446697300823, + "loss": 2.8603, + "mean_token_accuracy": 0.4455728530883789, + "num_tokens": 6992287817.0, + "step": 13678 + }, + { + "epoch": 3.6990265008112493, + "grad_norm": 3.1875, + "learning_rate": 0.005012269640984258, + "loss": 2.9477, + "mean_token_accuracy": 0.414397269487381, + "num_tokens": 6992812039.0, + "step": 13679 + }, + { + "epoch": 3.699296917252569, + "grad_norm": 3.515625, + "learning_rate": 0.005011092768388868, + "loss": 3.0042, + "mean_token_accuracy": 0.42483818531036377, + "num_tokens": 6993336263.0, + "step": 13680 + }, + { + "epoch": 3.6995673336938886, + "grad_norm": 2.96875, + "learning_rate": 0.005009916079550767, + "loss": 3.0498, + "mean_token_accuracy": 0.42476075887680054, + "num_tokens": 6993860438.0, + "step": 13681 + }, + { + "epoch": 3.699837750135208, + "grad_norm": 3.265625, + "learning_rate": 0.005008739574506053, + "loss": 3.0253, + "mean_token_accuracy": 0.4223855137825012, + "num_tokens": 6994384636.0, + "step": 13682 + }, + { + "epoch": 3.700108166576528, + "grad_norm": 2.421875, + "learning_rate": 0.0050075632532908315, + "loss": 2.7427, + "mean_token_accuracy": 0.40735894441604614, + "num_tokens": 6994908911.0, + "step": 13683 + }, + { + "epoch": 3.7003785830178475, + "grad_norm": 3.0625, + "learning_rate": 0.005006387115941187, + "loss": 2.9702, + "mean_token_accuracy": 0.4171748161315918, + "num_tokens": 6995421939.0, + "step": 13684 + }, + { + "epoch": 3.700648999459167, + "grad_norm": 3.609375, + "learning_rate": 0.0050052111624932175, + "loss": 2.8485, + "mean_token_accuracy": 0.43080848455429077, + "num_tokens": 6995946113.0, + "step": 13685 + }, + { + "epoch": 3.700919415900487, + "grad_norm": 3.171875, + "learning_rate": 0.005004035392982994, + "loss": 3.0494, + "mean_token_accuracy": 0.39700132608413696, + "num_tokens": 6996470300.0, + "step": 13686 + }, + { + "epoch": 3.7011898323418064, + "grad_norm": 4.15625, + "learning_rate": 0.005002859807446601, + "loss": 2.8799, + "mean_token_accuracy": 0.45069897174835205, + "num_tokens": 6996917275.0, + "step": 13687 + }, + { + "epoch": 3.701460248783126, + "grad_norm": 3.125, + "learning_rate": 0.005001684405920104, + "loss": 2.8511, + "mean_token_accuracy": 0.4192901849746704, + "num_tokens": 6997413600.0, + "step": 13688 + }, + { + "epoch": 3.7017306652244457, + "grad_norm": 2.703125, + "learning_rate": 0.005000509188439571, + "loss": 2.9166, + "mean_token_accuracy": 0.4266449511051178, + "num_tokens": 6997937823.0, + "step": 13689 + }, + { + "epoch": 3.7020010816657654, + "grad_norm": 3.78125, + "learning_rate": 0.004999334155041059, + "loss": 3.0462, + "mean_token_accuracy": 0.4146565794944763, + "num_tokens": 6998461862.0, + "step": 13690 + }, + { + "epoch": 3.702271498107085, + "grad_norm": 12.3125, + "learning_rate": 0.004998159305760619, + "loss": 10.1018, + "mean_token_accuracy": 0.007066684775054455, + "num_tokens": 6998986091.0, + "step": 13691 + }, + { + "epoch": 3.7025419145484046, + "grad_norm": 7.0625, + "learning_rate": 0.0049969846406343055, + "loss": 3.0274, + "mean_token_accuracy": 0.406322181224823, + "num_tokens": 6999510288.0, + "step": 13692 + }, + { + "epoch": 3.7028123309897243, + "grad_norm": 3.46875, + "learning_rate": 0.004995810159698157, + "loss": 2.6901, + "mean_token_accuracy": 0.4525715410709381, + "num_tokens": 7000034552.0, + "step": 13693 + }, + { + "epoch": 3.703082747431044, + "grad_norm": 2.59375, + "learning_rate": 0.004994635862988207, + "loss": 2.7379, + "mean_token_accuracy": 0.4386383891105652, + "num_tokens": 7000558688.0, + "step": 13694 + }, + { + "epoch": 3.7033531638723636, + "grad_norm": 2.921875, + "learning_rate": 0.004993461750540492, + "loss": 2.9149, + "mean_token_accuracy": 0.43736472725868225, + "num_tokens": 7001082875.0, + "step": 13695 + }, + { + "epoch": 3.703623580313683, + "grad_norm": 3.625, + "learning_rate": 0.004992287822391032, + "loss": 2.8132, + "mean_token_accuracy": 0.4451483488082886, + "num_tokens": 7001607072.0, + "step": 13696 + }, + { + "epoch": 3.703893996755003, + "grad_norm": 2.484375, + "learning_rate": 0.004991114078575851, + "loss": 3.1189, + "mean_token_accuracy": 0.41926074028015137, + "num_tokens": 7002127569.0, + "step": 13697 + }, + { + "epoch": 3.7041644131963225, + "grad_norm": 3.765625, + "learning_rate": 0.00498994051913096, + "loss": 3.0469, + "mean_token_accuracy": 0.4321051239967346, + "num_tokens": 7002611143.0, + "step": 13698 + }, + { + "epoch": 3.704434829637642, + "grad_norm": 3.21875, + "learning_rate": 0.00498876714409237, + "loss": 2.9712, + "mean_token_accuracy": 0.4167026877403259, + "num_tokens": 7003135426.0, + "step": 13699 + }, + { + "epoch": 3.7047052460789613, + "grad_norm": 3.625, + "learning_rate": 0.004987593953496079, + "loss": 3.0049, + "mean_token_accuracy": 0.41425514221191406, + "num_tokens": 7003659701.0, + "step": 13700 + }, + { + "epoch": 3.7049756625202814, + "grad_norm": 2.828125, + "learning_rate": 0.00498642094737809, + "loss": 2.9219, + "mean_token_accuracy": 0.41395437717437744, + "num_tokens": 7004183820.0, + "step": 13701 + }, + { + "epoch": 3.7052460789616006, + "grad_norm": 2.8125, + "learning_rate": 0.004985248125774387, + "loss": 2.9312, + "mean_token_accuracy": 0.4329380989074707, + "num_tokens": 7004665642.0, + "step": 13702 + }, + { + "epoch": 3.7055164954029207, + "grad_norm": 3.109375, + "learning_rate": 0.004984075488720964, + "loss": 2.919, + "mean_token_accuracy": 0.4305580258369446, + "num_tokens": 7005189918.0, + "step": 13703 + }, + { + "epoch": 3.70578691184424, + "grad_norm": 3.0, + "learning_rate": 0.004982903036253797, + "loss": 2.8685, + "mean_token_accuracy": 0.4346258044242859, + "num_tokens": 7005690861.0, + "step": 13704 + }, + { + "epoch": 3.70605732828556, + "grad_norm": 3.5, + "learning_rate": 0.004981730768408857, + "loss": 2.9544, + "mean_token_accuracy": 0.4450453519821167, + "num_tokens": 7006188414.0, + "step": 13705 + }, + { + "epoch": 3.706327744726879, + "grad_norm": 2.875, + "learning_rate": 0.004980558685222117, + "loss": 2.9027, + "mean_token_accuracy": 0.4331718385219574, + "num_tokens": 7006712499.0, + "step": 13706 + }, + { + "epoch": 3.7065981611681993, + "grad_norm": 3.328125, + "learning_rate": 0.00497938678672954, + "loss": 2.732, + "mean_token_accuracy": 0.4224281311035156, + "num_tokens": 7007236709.0, + "step": 13707 + }, + { + "epoch": 3.7068685776095185, + "grad_norm": 2.5, + "learning_rate": 0.004978215072967078, + "loss": 2.9107, + "mean_token_accuracy": 0.4345995783805847, + "num_tokens": 7007760920.0, + "step": 13708 + }, + { + "epoch": 3.707138994050838, + "grad_norm": 3.046875, + "learning_rate": 0.00497704354397069, + "loss": 2.886, + "mean_token_accuracy": 0.4307250380516052, + "num_tokens": 7008271299.0, + "step": 13709 + }, + { + "epoch": 3.7074094104921578, + "grad_norm": 3.140625, + "learning_rate": 0.004975872199776319, + "loss": 2.9523, + "mean_token_accuracy": 0.450518399477005, + "num_tokens": 7008688764.0, + "step": 13710 + }, + { + "epoch": 3.7076798269334774, + "grad_norm": 11.5, + "learning_rate": 0.004974701040419901, + "loss": 10.4329, + "mean_token_accuracy": 0.0006438601994886994, + "num_tokens": 7009212804.0, + "step": 13711 + }, + { + "epoch": 3.707950243374797, + "grad_norm": 9.25, + "learning_rate": 0.0049735300659373765, + "loss": 2.9699, + "mean_token_accuracy": 0.4363270401954651, + "num_tokens": 7009672835.0, + "step": 13712 + }, + { + "epoch": 3.7082206598161167, + "grad_norm": 2.796875, + "learning_rate": 0.004972359276364671, + "loss": 2.8878, + "mean_token_accuracy": 0.436389684677124, + "num_tokens": 7010142274.0, + "step": 13713 + }, + { + "epoch": 3.7084910762574363, + "grad_norm": 3.046875, + "learning_rate": 0.004971188671737708, + "loss": 2.8137, + "mean_token_accuracy": 0.43487098813056946, + "num_tokens": 7010666521.0, + "step": 13714 + }, + { + "epoch": 3.708761492698756, + "grad_norm": 2.59375, + "learning_rate": 0.004970018252092409, + "loss": 2.7791, + "mean_token_accuracy": 0.4390105605125427, + "num_tokens": 7011190677.0, + "step": 13715 + }, + { + "epoch": 3.7090319091400756, + "grad_norm": 3.140625, + "learning_rate": 0.004968848017464683, + "loss": 3.0482, + "mean_token_accuracy": 0.44442012906074524, + "num_tokens": 7011653272.0, + "step": 13716 + }, + { + "epoch": 3.7093023255813953, + "grad_norm": 3.25, + "learning_rate": 0.004967677967890435, + "loss": 2.8498, + "mean_token_accuracy": 0.4380144774913788, + "num_tokens": 7012142924.0, + "step": 13717 + }, + { + "epoch": 3.709572742022715, + "grad_norm": 3.859375, + "learning_rate": 0.004966508103405568, + "loss": 2.9141, + "mean_token_accuracy": 0.43014800548553467, + "num_tokens": 7012667071.0, + "step": 13718 + }, + { + "epoch": 3.7098431584640346, + "grad_norm": 3.03125, + "learning_rate": 0.004965338424045973, + "loss": 2.6771, + "mean_token_accuracy": 0.4478178024291992, + "num_tokens": 7013191335.0, + "step": 13719 + }, + { + "epoch": 3.710113574905354, + "grad_norm": 3.28125, + "learning_rate": 0.004964168929847545, + "loss": 2.7193, + "mean_token_accuracy": 0.43606066703796387, + "num_tokens": 7013715574.0, + "step": 13720 + }, + { + "epoch": 3.710383991346674, + "grad_norm": 2.796875, + "learning_rate": 0.0049629996208461645, + "loss": 2.9583, + "mean_token_accuracy": 0.42287522554397583, + "num_tokens": 7014239821.0, + "step": 13721 + }, + { + "epoch": 3.7106544077879935, + "grad_norm": 3.265625, + "learning_rate": 0.004961830497077706, + "loss": 2.8441, + "mean_token_accuracy": 0.4235385060310364, + "num_tokens": 7014713612.0, + "step": 13722 + }, + { + "epoch": 3.710924824229313, + "grad_norm": 2.953125, + "learning_rate": 0.004960661558578049, + "loss": 2.8396, + "mean_token_accuracy": 0.43031013011932373, + "num_tokens": 7015237668.0, + "step": 13723 + }, + { + "epoch": 3.7111952406706328, + "grad_norm": 3.46875, + "learning_rate": 0.004959492805383056, + "loss": 2.9559, + "mean_token_accuracy": 0.4263809323310852, + "num_tokens": 7015761921.0, + "step": 13724 + }, + { + "epoch": 3.7114656571119524, + "grad_norm": 3.171875, + "learning_rate": 0.004958324237528584, + "loss": 2.8016, + "mean_token_accuracy": 0.4334668517112732, + "num_tokens": 7016285984.0, + "step": 13725 + }, + { + "epoch": 3.711736073553272, + "grad_norm": 2.65625, + "learning_rate": 0.004957155855050494, + "loss": 2.7722, + "mean_token_accuracy": 0.4507240653038025, + "num_tokens": 7016810151.0, + "step": 13726 + }, + { + "epoch": 3.7120064899945917, + "grad_norm": 3.21875, + "learning_rate": 0.004955987657984631, + "loss": 3.0114, + "mean_token_accuracy": 0.4164940416812897, + "num_tokens": 7017334377.0, + "step": 13727 + }, + { + "epoch": 3.7122769064359114, + "grad_norm": 3.296875, + "learning_rate": 0.004954819646366841, + "loss": 2.8273, + "mean_token_accuracy": 0.42464980483055115, + "num_tokens": 7017858565.0, + "step": 13728 + }, + { + "epoch": 3.712547322877231, + "grad_norm": 3.09375, + "learning_rate": 0.004953651820232965, + "loss": 2.7939, + "mean_token_accuracy": 0.4404948353767395, + "num_tokens": 7018372035.0, + "step": 13729 + }, + { + "epoch": 3.7128177393185506, + "grad_norm": 3.046875, + "learning_rate": 0.004952484179618832, + "loss": 2.5576, + "mean_token_accuracy": 0.44567182660102844, + "num_tokens": 7018896173.0, + "step": 13730 + }, + { + "epoch": 3.7130881557598703, + "grad_norm": 5.625, + "learning_rate": 0.004951316724560267, + "loss": 8.6543, + "mean_token_accuracy": 0.04043008014559746, + "num_tokens": 7019420330.0, + "step": 13731 + }, + { + "epoch": 3.71335857220119, + "grad_norm": 10.625, + "learning_rate": 0.004950149455093095, + "loss": 2.8851, + "mean_token_accuracy": 0.4377761483192444, + "num_tokens": 7019936479.0, + "step": 13732 + }, + { + "epoch": 3.7136289886425096, + "grad_norm": 2.96875, + "learning_rate": 0.004948982371253129, + "loss": 2.825, + "mean_token_accuracy": 0.44770634174346924, + "num_tokens": 7020439110.0, + "step": 13733 + }, + { + "epoch": 3.713899405083829, + "grad_norm": 2.53125, + "learning_rate": 0.004947815473076176, + "loss": 2.7601, + "mean_token_accuracy": 0.4339989423751831, + "num_tokens": 7020934636.0, + "step": 13734 + }, + { + "epoch": 3.714169821525149, + "grad_norm": 2.625, + "learning_rate": 0.004946648760598046, + "loss": 2.806, + "mean_token_accuracy": 0.4422317147254944, + "num_tokens": 7021458867.0, + "step": 13735 + }, + { + "epoch": 3.7144402379664685, + "grad_norm": 3.015625, + "learning_rate": 0.004945482233854532, + "loss": 2.8917, + "mean_token_accuracy": 0.45262759923934937, + "num_tokens": 7021937327.0, + "step": 13736 + }, + { + "epoch": 3.714710654407788, + "grad_norm": 3.109375, + "learning_rate": 0.004944315892881427, + "loss": 2.8048, + "mean_token_accuracy": 0.44440072774887085, + "num_tokens": 7022461551.0, + "step": 13737 + }, + { + "epoch": 3.714981070849108, + "grad_norm": 2.859375, + "learning_rate": 0.004943149737714522, + "loss": 2.9802, + "mean_token_accuracy": 0.4476023316383362, + "num_tokens": 7022880501.0, + "step": 13738 + }, + { + "epoch": 3.7152514872904274, + "grad_norm": 3.53125, + "learning_rate": 0.004941983768389595, + "loss": 2.9806, + "mean_token_accuracy": 0.4142216145992279, + "num_tokens": 7023349089.0, + "step": 13739 + }, + { + "epoch": 3.715521903731747, + "grad_norm": 3.140625, + "learning_rate": 0.004940817984942418, + "loss": 2.9513, + "mean_token_accuracy": 0.42932969331741333, + "num_tokens": 7023868059.0, + "step": 13740 + }, + { + "epoch": 3.7157923201730663, + "grad_norm": 3.3125, + "learning_rate": 0.004939652387408765, + "loss": 3.0547, + "mean_token_accuracy": 0.4047252833843231, + "num_tokens": 7024392209.0, + "step": 13741 + }, + { + "epoch": 3.7160627366143864, + "grad_norm": 2.953125, + "learning_rate": 0.0049384869758244, + "loss": 2.7755, + "mean_token_accuracy": 0.4476628005504608, + "num_tokens": 7024916402.0, + "step": 13742 + }, + { + "epoch": 3.7163331530557056, + "grad_norm": 2.875, + "learning_rate": 0.00493732175022508, + "loss": 2.8028, + "mean_token_accuracy": 0.42517128586769104, + "num_tokens": 7025440684.0, + "step": 13743 + }, + { + "epoch": 3.7166035694970256, + "grad_norm": 3.703125, + "learning_rate": 0.004936156710646559, + "loss": 2.7664, + "mean_token_accuracy": 0.43102380633354187, + "num_tokens": 7025964859.0, + "step": 13744 + }, + { + "epoch": 3.716873985938345, + "grad_norm": 2.921875, + "learning_rate": 0.00493499185712458, + "loss": 2.9847, + "mean_token_accuracy": 0.41308319568634033, + "num_tokens": 7026489137.0, + "step": 13745 + }, + { + "epoch": 3.717144402379665, + "grad_norm": 3.953125, + "learning_rate": 0.00493382718969489, + "loss": 2.9164, + "mean_token_accuracy": 0.4282211661338806, + "num_tokens": 7027013170.0, + "step": 13746 + }, + { + "epoch": 3.717414818820984, + "grad_norm": 2.71875, + "learning_rate": 0.004932662708393222, + "loss": 3.0019, + "mean_token_accuracy": 0.41774481534957886, + "num_tokens": 7027537321.0, + "step": 13747 + }, + { + "epoch": 3.717685235262304, + "grad_norm": 3.71875, + "learning_rate": 0.004931498413255301, + "loss": 2.9998, + "mean_token_accuracy": 0.42069387435913086, + "num_tokens": 7028053183.0, + "step": 13748 + }, + { + "epoch": 3.7179556517036234, + "grad_norm": 34.5, + "learning_rate": 0.004930334304316859, + "loss": 2.9553, + "mean_token_accuracy": 0.43192875385284424, + "num_tokens": 7028577460.0, + "step": 13749 + }, + { + "epoch": 3.718226068144943, + "grad_norm": 10.0625, + "learning_rate": 0.004929170381613609, + "loss": 2.873, + "mean_token_accuracy": 0.45355939865112305, + "num_tokens": 7029101622.0, + "step": 13750 + }, + { + "epoch": 3.7184964845862627, + "grad_norm": 127.0, + "learning_rate": 0.004928006645181263, + "loss": 10.0353, + "mean_token_accuracy": 0.029512491077184677, + "num_tokens": 7029625842.0, + "step": 13751 + }, + { + "epoch": 3.7187669010275823, + "grad_norm": 3.34375, + "learning_rate": 0.004926843095055532, + "loss": 3.0115, + "mean_token_accuracy": 0.42271989583969116, + "num_tokens": 7030150127.0, + "step": 13752 + }, + { + "epoch": 3.719037317468902, + "grad_norm": 2.84375, + "learning_rate": 0.0049256797312721146, + "loss": 2.663, + "mean_token_accuracy": 0.45204365253448486, + "num_tokens": 7030674139.0, + "step": 13753 + }, + { + "epoch": 3.7193077339102216, + "grad_norm": 3.21875, + "learning_rate": 0.0049245165538667045, + "loss": 2.8268, + "mean_token_accuracy": 0.41070669889450073, + "num_tokens": 7031175701.0, + "step": 13754 + }, + { + "epoch": 3.7195781503515413, + "grad_norm": 2.640625, + "learning_rate": 0.004923353562874995, + "loss": 2.8282, + "mean_token_accuracy": 0.4434160590171814, + "num_tokens": 7031699937.0, + "step": 13755 + }, + { + "epoch": 3.719848566792861, + "grad_norm": 2.953125, + "learning_rate": 0.0049221907583326665, + "loss": 3.0962, + "mean_token_accuracy": 0.40577432513237, + "num_tokens": 7032224218.0, + "step": 13756 + }, + { + "epoch": 3.7201189832341806, + "grad_norm": 2.859375, + "learning_rate": 0.004921028140275399, + "loss": 2.9001, + "mean_token_accuracy": 0.41490304470062256, + "num_tokens": 7032748449.0, + "step": 13757 + }, + { + "epoch": 3.7203893996755, + "grad_norm": 3.21875, + "learning_rate": 0.004919865708738868, + "loss": 2.8502, + "mean_token_accuracy": 0.4363354742527008, + "num_tokens": 7033272632.0, + "step": 13758 + }, + { + "epoch": 3.72065981611682, + "grad_norm": 3.0625, + "learning_rate": 0.004918703463758737, + "loss": 2.9898, + "mean_token_accuracy": 0.4331245720386505, + "num_tokens": 7033796829.0, + "step": 13759 + }, + { + "epoch": 3.7209302325581395, + "grad_norm": 3.53125, + "learning_rate": 0.004917541405370666, + "loss": 2.9382, + "mean_token_accuracy": 0.427889883518219, + "num_tokens": 7034320996.0, + "step": 13760 + }, + { + "epoch": 3.721200648999459, + "grad_norm": 2.75, + "learning_rate": 0.004916379533610314, + "loss": 2.8458, + "mean_token_accuracy": 0.44787612557411194, + "num_tokens": 7034753196.0, + "step": 13761 + }, + { + "epoch": 3.7214710654407788, + "grad_norm": 3.265625, + "learning_rate": 0.004915217848513329, + "loss": 2.9284, + "mean_token_accuracy": 0.4166290760040283, + "num_tokens": 7035277366.0, + "step": 13762 + }, + { + "epoch": 3.7217414818820984, + "grad_norm": 2.84375, + "learning_rate": 0.004914056350115352, + "loss": 2.9645, + "mean_token_accuracy": 0.4330848455429077, + "num_tokens": 7035801546.0, + "step": 13763 + }, + { + "epoch": 3.722011898323418, + "grad_norm": 3.28125, + "learning_rate": 0.004912895038452026, + "loss": 3.0547, + "mean_token_accuracy": 0.4115781784057617, + "num_tokens": 7036325808.0, + "step": 13764 + }, + { + "epoch": 3.7222823147647377, + "grad_norm": 2.796875, + "learning_rate": 0.004911733913558979, + "loss": 2.8212, + "mean_token_accuracy": 0.4338860809803009, + "num_tokens": 7036849989.0, + "step": 13765 + }, + { + "epoch": 3.7225527312060573, + "grad_norm": 3.265625, + "learning_rate": 0.004910572975471842, + "loss": 2.9387, + "mean_token_accuracy": 0.45197242498397827, + "num_tokens": 7037303551.0, + "step": 13766 + }, + { + "epoch": 3.722823147647377, + "grad_norm": 3.078125, + "learning_rate": 0.004909412224226235, + "loss": 2.9584, + "mean_token_accuracy": 0.43599724769592285, + "num_tokens": 7037827774.0, + "step": 13767 + }, + { + "epoch": 3.7230935640886966, + "grad_norm": 4.03125, + "learning_rate": 0.004908251659857769, + "loss": 2.869, + "mean_token_accuracy": 0.42199110984802246, + "num_tokens": 7038351894.0, + "step": 13768 + }, + { + "epoch": 3.7233639805300163, + "grad_norm": 3.109375, + "learning_rate": 0.004907091282402059, + "loss": 2.8169, + "mean_token_accuracy": 0.43987607955932617, + "num_tokens": 7038876143.0, + "step": 13769 + }, + { + "epoch": 3.723634396971336, + "grad_norm": 3.46875, + "learning_rate": 0.004905931091894705, + "loss": 2.6521, + "mean_token_accuracy": 0.4427418112754822, + "num_tokens": 7039400369.0, + "step": 13770 + }, + { + "epoch": 3.7239048134126556, + "grad_norm": 51.25, + "learning_rate": 0.0049047710883713055, + "loss": 12.7488, + "mean_token_accuracy": 0.008038075640797615, + "num_tokens": 7039924436.0, + "step": 13771 + }, + { + "epoch": 3.724175229853975, + "grad_norm": 8.375, + "learning_rate": 0.004903611271867456, + "loss": 2.6562, + "mean_token_accuracy": 0.4265400171279907, + "num_tokens": 7040448587.0, + "step": 13772 + }, + { + "epoch": 3.724445646295295, + "grad_norm": 2.984375, + "learning_rate": 0.004902451642418742, + "loss": 3.1652, + "mean_token_accuracy": 0.40681320428848267, + "num_tokens": 7040916490.0, + "step": 13773 + }, + { + "epoch": 3.7247160627366145, + "grad_norm": 2.484375, + "learning_rate": 0.004901292200060741, + "loss": 2.8663, + "mean_token_accuracy": 0.43388521671295166, + "num_tokens": 7041440646.0, + "step": 13774 + }, + { + "epoch": 3.724986479177934, + "grad_norm": 2.9375, + "learning_rate": 0.0049001329448290325, + "loss": 2.8408, + "mean_token_accuracy": 0.43978387117385864, + "num_tokens": 7041964865.0, + "step": 13775 + }, + { + "epoch": 3.7252568956192538, + "grad_norm": 3.03125, + "learning_rate": 0.004898973876759184, + "loss": 2.834, + "mean_token_accuracy": 0.42279863357543945, + "num_tokens": 7042489126.0, + "step": 13776 + }, + { + "epoch": 3.7255273120605734, + "grad_norm": 2.84375, + "learning_rate": 0.004897814995886755, + "loss": 3.0539, + "mean_token_accuracy": 0.4015117287635803, + "num_tokens": 7043013388.0, + "step": 13777 + }, + { + "epoch": 3.725797728501893, + "grad_norm": 2.90625, + "learning_rate": 0.004896656302247312, + "loss": 2.9851, + "mean_token_accuracy": 0.414201021194458, + "num_tokens": 7043537667.0, + "step": 13778 + }, + { + "epoch": 3.7260681449432127, + "grad_norm": 3.15625, + "learning_rate": 0.004895497795876402, + "loss": 2.7387, + "mean_token_accuracy": 0.43055737018585205, + "num_tokens": 7044061823.0, + "step": 13779 + }, + { + "epoch": 3.7263385613845323, + "grad_norm": 2.796875, + "learning_rate": 0.0048943394768095685, + "loss": 2.9434, + "mean_token_accuracy": 0.42327725887298584, + "num_tokens": 7044578023.0, + "step": 13780 + }, + { + "epoch": 3.726608977825852, + "grad_norm": 3.3125, + "learning_rate": 0.0048931813450823584, + "loss": 2.9432, + "mean_token_accuracy": 0.39504969120025635, + "num_tokens": 7045102209.0, + "step": 13781 + }, + { + "epoch": 3.726879394267171, + "grad_norm": 2.703125, + "learning_rate": 0.004892023400730304, + "loss": 3.0384, + "mean_token_accuracy": 0.4347533583641052, + "num_tokens": 7045564700.0, + "step": 13782 + }, + { + "epoch": 3.7271498107084913, + "grad_norm": 3.421875, + "learning_rate": 0.0048908656437889305, + "loss": 2.9277, + "mean_token_accuracy": 0.42614415287971497, + "num_tokens": 7046088949.0, + "step": 13783 + }, + { + "epoch": 3.7274202271498105, + "grad_norm": 2.546875, + "learning_rate": 0.004889708074293765, + "loss": 2.8068, + "mean_token_accuracy": 0.44080036878585815, + "num_tokens": 7046613191.0, + "step": 13784 + }, + { + "epoch": 3.7276906435911306, + "grad_norm": 3.15625, + "learning_rate": 0.004888550692280328, + "loss": 2.8339, + "mean_token_accuracy": 0.42806464433670044, + "num_tokens": 7047137444.0, + "step": 13785 + }, + { + "epoch": 3.7279610600324498, + "grad_norm": 3.28125, + "learning_rate": 0.004887393497784126, + "loss": 2.7255, + "mean_token_accuracy": 0.42463594675064087, + "num_tokens": 7047661675.0, + "step": 13786 + }, + { + "epoch": 3.72823147647377, + "grad_norm": 2.5, + "learning_rate": 0.004886236490840669, + "loss": 2.8922, + "mean_token_accuracy": 0.4354743957519531, + "num_tokens": 7048185707.0, + "step": 13787 + }, + { + "epoch": 3.728501892915089, + "grad_norm": 3.8125, + "learning_rate": 0.004885079671485454, + "loss": 3.1175, + "mean_token_accuracy": 0.41604605317115784, + "num_tokens": 7048709879.0, + "step": 13788 + }, + { + "epoch": 3.728772309356409, + "grad_norm": 2.84375, + "learning_rate": 0.004883923039753981, + "loss": 2.9783, + "mean_token_accuracy": 0.42831486463546753, + "num_tokens": 7049234159.0, + "step": 13789 + }, + { + "epoch": 3.7290427257977283, + "grad_norm": 2.75, + "learning_rate": 0.004882766595681733, + "loss": 2.6824, + "mean_token_accuracy": 0.44514554738998413, + "num_tokens": 7049758334.0, + "step": 13790 + }, + { + "epoch": 3.729313142239048, + "grad_norm": 185.0, + "learning_rate": 0.004881610339304195, + "loss": 12.6797, + "mean_token_accuracy": 0.0, + "num_tokens": 7050193762.0, + "step": 13791 + }, + { + "epoch": 3.7295835586803676, + "grad_norm": 4.75, + "learning_rate": 0.004880454270656846, + "loss": 2.9053, + "mean_token_accuracy": 0.44711437821388245, + "num_tokens": 7050681316.0, + "step": 13792 + }, + { + "epoch": 3.7298539751216873, + "grad_norm": 2.234375, + "learning_rate": 0.004879298389775158, + "loss": 2.983, + "mean_token_accuracy": 0.43308430910110474, + "num_tokens": 7051205602.0, + "step": 13793 + }, + { + "epoch": 3.730124391563007, + "grad_norm": 3.09375, + "learning_rate": 0.004878142696694591, + "loss": 2.7565, + "mean_token_accuracy": 0.4340769648551941, + "num_tokens": 7051696506.0, + "step": 13794 + }, + { + "epoch": 3.7303948080043265, + "grad_norm": 2.9375, + "learning_rate": 0.004876987191450612, + "loss": 2.7901, + "mean_token_accuracy": 0.43602412939071655, + "num_tokens": 7052220729.0, + "step": 13795 + }, + { + "epoch": 3.730665224445646, + "grad_norm": 2.5, + "learning_rate": 0.004875831874078674, + "loss": 2.7389, + "mean_token_accuracy": 0.43101251125335693, + "num_tokens": 7052744939.0, + "step": 13796 + }, + { + "epoch": 3.730935640886966, + "grad_norm": 2.84375, + "learning_rate": 0.00487467674461422, + "loss": 3.0261, + "mean_token_accuracy": 0.41487541794776917, + "num_tokens": 7053269131.0, + "step": 13797 + }, + { + "epoch": 3.7312060573282855, + "grad_norm": 2.828125, + "learning_rate": 0.004873521803092698, + "loss": 2.8807, + "mean_token_accuracy": 0.4110731780529022, + "num_tokens": 7053793348.0, + "step": 13798 + }, + { + "epoch": 3.731476473769605, + "grad_norm": 2.90625, + "learning_rate": 0.004872367049549546, + "loss": 2.8837, + "mean_token_accuracy": 0.4527776837348938, + "num_tokens": 7054317536.0, + "step": 13799 + }, + { + "epoch": 3.7317468902109248, + "grad_norm": 3.046875, + "learning_rate": 0.00487121248402019, + "loss": 2.6995, + "mean_token_accuracy": 0.4535493850708008, + "num_tokens": 7054808643.0, + "step": 13800 + }, + { + "epoch": 3.7320173066522444, + "grad_norm": 3.53125, + "learning_rate": 0.0048700581065400615, + "loss": 2.7491, + "mean_token_accuracy": 0.46973443031311035, + "num_tokens": 7055182042.0, + "step": 13801 + }, + { + "epoch": 3.732287723093564, + "grad_norm": 3.34375, + "learning_rate": 0.004868903917144577, + "loss": 2.827, + "mean_token_accuracy": 0.45756518840789795, + "num_tokens": 7055658562.0, + "step": 13802 + }, + { + "epoch": 3.7325581395348837, + "grad_norm": 2.796875, + "learning_rate": 0.004867749915869149, + "loss": 2.9576, + "mean_token_accuracy": 0.4282735586166382, + "num_tokens": 7056182779.0, + "step": 13803 + }, + { + "epoch": 3.7328285559762033, + "grad_norm": 3.109375, + "learning_rate": 0.00486659610274919, + "loss": 2.8734, + "mean_token_accuracy": 0.4391137659549713, + "num_tokens": 7056689380.0, + "step": 13804 + }, + { + "epoch": 3.733098972417523, + "grad_norm": 3.359375, + "learning_rate": 0.004865442477820099, + "loss": 2.9627, + "mean_token_accuracy": 0.41864168643951416, + "num_tokens": 7057213657.0, + "step": 13805 + }, + { + "epoch": 3.7333693888588426, + "grad_norm": 3.03125, + "learning_rate": 0.004864289041117271, + "loss": 2.7513, + "mean_token_accuracy": 0.4698101282119751, + "num_tokens": 7057737906.0, + "step": 13806 + }, + { + "epoch": 3.7336398053001623, + "grad_norm": 2.953125, + "learning_rate": 0.004863135792676102, + "loss": 2.9434, + "mean_token_accuracy": 0.4296738803386688, + "num_tokens": 7058262077.0, + "step": 13807 + }, + { + "epoch": 3.733910221741482, + "grad_norm": 3.265625, + "learning_rate": 0.004861982732531973, + "loss": 2.8158, + "mean_token_accuracy": 0.4255406856536865, + "num_tokens": 7058786109.0, + "step": 13808 + }, + { + "epoch": 3.7341806381828015, + "grad_norm": 2.515625, + "learning_rate": 0.004860829860720263, + "loss": 2.9124, + "mean_token_accuracy": 0.4238191246986389, + "num_tokens": 7059310376.0, + "step": 13809 + }, + { + "epoch": 3.734451054624121, + "grad_norm": 3.796875, + "learning_rate": 0.00485967717727635, + "loss": 2.9022, + "mean_token_accuracy": 0.4197719693183899, + "num_tokens": 7059834565.0, + "step": 13810 + }, + { + "epoch": 3.734721471065441, + "grad_norm": 8.0, + "learning_rate": 0.004858524682235596, + "loss": 10.6443, + "mean_token_accuracy": 6.381537787092384e-06, + "num_tokens": 7060358839.0, + "step": 13811 + }, + { + "epoch": 3.7349918875067605, + "grad_norm": 6.84375, + "learning_rate": 0.004857372375633368, + "loss": 2.953, + "mean_token_accuracy": 0.410398006439209, + "num_tokens": 7060883010.0, + "step": 13812 + }, + { + "epoch": 3.73526230394808, + "grad_norm": 2.40625, + "learning_rate": 0.004856220257505016, + "loss": 2.9442, + "mean_token_accuracy": 0.42180925607681274, + "num_tokens": 7061364592.0, + "step": 13813 + }, + { + "epoch": 3.7355327203893998, + "grad_norm": 3.171875, + "learning_rate": 0.004855068327885895, + "loss": 3.1055, + "mean_token_accuracy": 0.41386401653289795, + "num_tokens": 7061888858.0, + "step": 13814 + }, + { + "epoch": 3.7358031368307194, + "grad_norm": 3.84375, + "learning_rate": 0.004853916586811352, + "loss": 2.9641, + "mean_token_accuracy": 0.4194130003452301, + "num_tokens": 7062413117.0, + "step": 13815 + }, + { + "epoch": 3.736073553272039, + "grad_norm": 24.25, + "learning_rate": 0.0048527650343167215, + "loss": 2.7997, + "mean_token_accuracy": 0.44465935230255127, + "num_tokens": 7062937351.0, + "step": 13816 + }, + { + "epoch": 3.7363439697133587, + "grad_norm": 4.4375, + "learning_rate": 0.004851613670437336, + "loss": 3.0446, + "mean_token_accuracy": 0.4235828220844269, + "num_tokens": 7063461626.0, + "step": 13817 + }, + { + "epoch": 3.7366143861546783, + "grad_norm": 3.0625, + "learning_rate": 0.004850462495208527, + "loss": 2.977, + "mean_token_accuracy": 0.4245541989803314, + "num_tokens": 7063949205.0, + "step": 13818 + }, + { + "epoch": 3.736884802595998, + "grad_norm": 3.453125, + "learning_rate": 0.004849311508665612, + "loss": 2.7883, + "mean_token_accuracy": 0.43817228078842163, + "num_tokens": 7064473372.0, + "step": 13819 + }, + { + "epoch": 3.7371552190373176, + "grad_norm": 3.21875, + "learning_rate": 0.004848160710843907, + "loss": 2.9015, + "mean_token_accuracy": 0.4283362627029419, + "num_tokens": 7064997571.0, + "step": 13820 + }, + { + "epoch": 3.7374256354786373, + "grad_norm": 4.25, + "learning_rate": 0.004847010101778724, + "loss": 2.9003, + "mean_token_accuracy": 0.41011562943458557, + "num_tokens": 7065521829.0, + "step": 13821 + }, + { + "epoch": 3.737696051919957, + "grad_norm": 2.21875, + "learning_rate": 0.004845859681505366, + "loss": 2.9569, + "mean_token_accuracy": 0.4222087264060974, + "num_tokens": 7066045980.0, + "step": 13822 + }, + { + "epoch": 3.737966468361276, + "grad_norm": 3.484375, + "learning_rate": 0.004844709450059129, + "loss": 2.7167, + "mean_token_accuracy": 0.46167871356010437, + "num_tokens": 7066511156.0, + "step": 13823 + }, + { + "epoch": 3.738236884802596, + "grad_norm": 3.40625, + "learning_rate": 0.004843559407475311, + "loss": 2.8303, + "mean_token_accuracy": 0.43505650758743286, + "num_tokens": 7067035321.0, + "step": 13824 + }, + { + "epoch": 3.7385073012439154, + "grad_norm": 3.9375, + "learning_rate": 0.004842409553789194, + "loss": 2.7791, + "mean_token_accuracy": 0.4288114309310913, + "num_tokens": 7067559467.0, + "step": 13825 + }, + { + "epoch": 3.7387777176852355, + "grad_norm": 3.390625, + "learning_rate": 0.004841259889036058, + "loss": 2.8651, + "mean_token_accuracy": 0.4435999393463135, + "num_tokens": 7068083666.0, + "step": 13826 + }, + { + "epoch": 3.7390481341265547, + "grad_norm": 3.875, + "learning_rate": 0.00484011041325118, + "loss": 2.9188, + "mean_token_accuracy": 0.45016342401504517, + "num_tokens": 7068545660.0, + "step": 13827 + }, + { + "epoch": 3.7393185505678748, + "grad_norm": 3.234375, + "learning_rate": 0.004838961126469832, + "loss": 2.9448, + "mean_token_accuracy": 0.4402931034564972, + "num_tokens": 7069061988.0, + "step": 13828 + }, + { + "epoch": 3.739588967009194, + "grad_norm": 3.203125, + "learning_rate": 0.004837812028727273, + "loss": 2.8466, + "mean_token_accuracy": 0.4299447238445282, + "num_tokens": 7069586268.0, + "step": 13829 + }, + { + "epoch": 3.739859383450514, + "grad_norm": 2.78125, + "learning_rate": 0.004836663120058766, + "loss": 3.0055, + "mean_token_accuracy": 0.42393481731414795, + "num_tokens": 7070089550.0, + "step": 13830 + }, + { + "epoch": 3.7401297998918333, + "grad_norm": 8.6875, + "learning_rate": 0.004835514400499558, + "loss": 10.9071, + "mean_token_accuracy": 0.0, + "num_tokens": 7070613728.0, + "step": 13831 + }, + { + "epoch": 3.740400216333153, + "grad_norm": 5.5, + "learning_rate": 0.004834365870084895, + "loss": 2.8965, + "mean_token_accuracy": 0.4200971722602844, + "num_tokens": 7071072759.0, + "step": 13832 + }, + { + "epoch": 3.7406706327744725, + "grad_norm": 2.53125, + "learning_rate": 0.004833217528850021, + "loss": 2.9402, + "mean_token_accuracy": 0.3888978064060211, + "num_tokens": 7071596956.0, + "step": 13833 + }, + { + "epoch": 3.740941049215792, + "grad_norm": 2.359375, + "learning_rate": 0.004832069376830167, + "loss": 2.9601, + "mean_token_accuracy": 0.4310157001018524, + "num_tokens": 7072065527.0, + "step": 13834 + }, + { + "epoch": 3.741211465657112, + "grad_norm": 4.0, + "learning_rate": 0.0048309214140605655, + "loss": 2.5969, + "mean_token_accuracy": 0.47793978452682495, + "num_tokens": 7072589628.0, + "step": 13835 + }, + { + "epoch": 3.7414818820984315, + "grad_norm": 2.375, + "learning_rate": 0.004829773640576437, + "loss": 2.9473, + "mean_token_accuracy": 0.42696526646614075, + "num_tokens": 7073113820.0, + "step": 13836 + }, + { + "epoch": 3.741752298539751, + "grad_norm": 5.75, + "learning_rate": 0.004828626056412997, + "loss": 2.63, + "mean_token_accuracy": 0.4632754325866699, + "num_tokens": 7073638030.0, + "step": 13837 + }, + { + "epoch": 3.7420227149810708, + "grad_norm": 2.46875, + "learning_rate": 0.004827478661605459, + "loss": 2.7956, + "mean_token_accuracy": 0.4400421380996704, + "num_tokens": 7074114144.0, + "step": 13838 + }, + { + "epoch": 3.7422931314223904, + "grad_norm": 3.484375, + "learning_rate": 0.004826331456189029, + "loss": 2.8452, + "mean_token_accuracy": 0.44151800870895386, + "num_tokens": 7074638317.0, + "step": 13839 + }, + { + "epoch": 3.74256354786371, + "grad_norm": 4.1875, + "learning_rate": 0.004825184440198903, + "loss": 2.9076, + "mean_token_accuracy": 0.41565805673599243, + "num_tokens": 7075162569.0, + "step": 13840 + }, + { + "epoch": 3.7428339643050297, + "grad_norm": 2.90625, + "learning_rate": 0.0048240376136702776, + "loss": 2.9488, + "mean_token_accuracy": 0.4383903741836548, + "num_tokens": 7075642927.0, + "step": 13841 + }, + { + "epoch": 3.7431043807463493, + "grad_norm": 3.609375, + "learning_rate": 0.004822890976638343, + "loss": 3.0097, + "mean_token_accuracy": 0.42311620712280273, + "num_tokens": 7076160668.0, + "step": 13842 + }, + { + "epoch": 3.743374797187669, + "grad_norm": 2.84375, + "learning_rate": 0.004821744529138276, + "loss": 3.0677, + "mean_token_accuracy": 0.4225272536277771, + "num_tokens": 7076658477.0, + "step": 13843 + }, + { + "epoch": 3.7436452136289886, + "grad_norm": 2.953125, + "learning_rate": 0.0048205982712052585, + "loss": 2.7201, + "mean_token_accuracy": 0.47189727425575256, + "num_tokens": 7077117848.0, + "step": 13844 + }, + { + "epoch": 3.7439156300703083, + "grad_norm": 2.328125, + "learning_rate": 0.004819452202874459, + "loss": 2.8933, + "mean_token_accuracy": 0.41437816619873047, + "num_tokens": 7077622189.0, + "step": 13845 + }, + { + "epoch": 3.744186046511628, + "grad_norm": 3.453125, + "learning_rate": 0.004818306324181038, + "loss": 3.0474, + "mean_token_accuracy": 0.43536585569381714, + "num_tokens": 7078045549.0, + "step": 13846 + }, + { + "epoch": 3.7444564629529475, + "grad_norm": 3.109375, + "learning_rate": 0.0048171606351601615, + "loss": 2.8766, + "mean_token_accuracy": 0.43436798453330994, + "num_tokens": 7078524353.0, + "step": 13847 + }, + { + "epoch": 3.744726879394267, + "grad_norm": 2.59375, + "learning_rate": 0.0048160151358469785, + "loss": 2.9102, + "mean_token_accuracy": 0.43609321117401123, + "num_tokens": 7079048580.0, + "step": 13848 + }, + { + "epoch": 3.744997295835587, + "grad_norm": 15.9375, + "learning_rate": 0.004814869826276636, + "loss": 2.7036, + "mean_token_accuracy": 0.45260927081108093, + "num_tokens": 7079547419.0, + "step": 13849 + }, + { + "epoch": 3.7452677122769065, + "grad_norm": 4.75, + "learning_rate": 0.004813724706484278, + "loss": 3.0942, + "mean_token_accuracy": 0.4168725609779358, + "num_tokens": 7080071595.0, + "step": 13850 + }, + { + "epoch": 3.745538128718226, + "grad_norm": 14.0, + "learning_rate": 0.004812579776505039, + "loss": 13.6606, + "mean_token_accuracy": 0.009449781849980354, + "num_tokens": 7080536622.0, + "step": 13851 + }, + { + "epoch": 3.7458085451595458, + "grad_norm": 5.0, + "learning_rate": 0.004811435036374046, + "loss": 2.9377, + "mean_token_accuracy": 0.4169411063194275, + "num_tokens": 7081060724.0, + "step": 13852 + }, + { + "epoch": 3.7460789616008654, + "grad_norm": 2.546875, + "learning_rate": 0.004810290486126428, + "loss": 3.0561, + "mean_token_accuracy": 0.3950599730014801, + "num_tokens": 7081584872.0, + "step": 13853 + }, + { + "epoch": 3.746349378042185, + "grad_norm": 2.71875, + "learning_rate": 0.004809146125797298, + "loss": 2.7819, + "mean_token_accuracy": 0.4419671297073364, + "num_tokens": 7082109139.0, + "step": 13854 + }, + { + "epoch": 3.7466197944835047, + "grad_norm": 4.0, + "learning_rate": 0.004808001955421774, + "loss": 2.9321, + "mean_token_accuracy": 0.4491812288761139, + "num_tokens": 7082593280.0, + "step": 13855 + }, + { + "epoch": 3.7468902109248243, + "grad_norm": 3.3125, + "learning_rate": 0.004806857975034956, + "loss": 2.9153, + "mean_token_accuracy": 0.43418100476264954, + "num_tokens": 7083117550.0, + "step": 13856 + }, + { + "epoch": 3.747160627366144, + "grad_norm": 3.453125, + "learning_rate": 0.004805714184671951, + "loss": 2.8371, + "mean_token_accuracy": 0.42884916067123413, + "num_tokens": 7083601438.0, + "step": 13857 + }, + { + "epoch": 3.7474310438074636, + "grad_norm": 2.65625, + "learning_rate": 0.004804570584367849, + "loss": 2.8752, + "mean_token_accuracy": 0.42111775279045105, + "num_tokens": 7084125549.0, + "step": 13858 + }, + { + "epoch": 3.7477014602487833, + "grad_norm": 3.484375, + "learning_rate": 0.004803427174157743, + "loss": 2.6115, + "mean_token_accuracy": 0.48529955744743347, + "num_tokens": 7084649832.0, + "step": 13859 + }, + { + "epoch": 3.747971876690103, + "grad_norm": 2.515625, + "learning_rate": 0.004802283954076712, + "loss": 2.9077, + "mean_token_accuracy": 0.45162808895111084, + "num_tokens": 7085117930.0, + "step": 13860 + }, + { + "epoch": 3.7482422931314225, + "grad_norm": 3.03125, + "learning_rate": 0.004801140924159838, + "loss": 2.7187, + "mean_token_accuracy": 0.43256181478500366, + "num_tokens": 7085642035.0, + "step": 13861 + }, + { + "epoch": 3.748512709572742, + "grad_norm": 2.65625, + "learning_rate": 0.004799998084442189, + "loss": 2.8953, + "mean_token_accuracy": 0.44144681096076965, + "num_tokens": 7086142681.0, + "step": 13862 + }, + { + "epoch": 3.748783126014062, + "grad_norm": 2.5625, + "learning_rate": 0.0047988554349588305, + "loss": 2.8076, + "mean_token_accuracy": 0.4351925849914551, + "num_tokens": 7086646916.0, + "step": 13863 + }, + { + "epoch": 3.749053542455381, + "grad_norm": 2.609375, + "learning_rate": 0.004797712975744826, + "loss": 2.9286, + "mean_token_accuracy": 0.4203856587409973, + "num_tokens": 7087171147.0, + "step": 13864 + }, + { + "epoch": 3.749323958896701, + "grad_norm": 2.84375, + "learning_rate": 0.0047965707068352275, + "loss": 2.7487, + "mean_token_accuracy": 0.42169642448425293, + "num_tokens": 7087695333.0, + "step": 13865 + }, + { + "epoch": 3.7495943753380203, + "grad_norm": 3.375, + "learning_rate": 0.00479542862826508, + "loss": 2.8983, + "mean_token_accuracy": 0.42460957169532776, + "num_tokens": 7088180473.0, + "step": 13866 + }, + { + "epoch": 3.7498647917793404, + "grad_norm": 3.171875, + "learning_rate": 0.004794286740069431, + "loss": 2.8574, + "mean_token_accuracy": 0.4335539937019348, + "num_tokens": 7088704678.0, + "step": 13867 + }, + { + "epoch": 3.7501352082206596, + "grad_norm": 2.796875, + "learning_rate": 0.004793145042283313, + "loss": 2.8411, + "mean_token_accuracy": 0.4389486014842987, + "num_tokens": 7089182224.0, + "step": 13868 + }, + { + "epoch": 3.7504056246619797, + "grad_norm": 2.859375, + "learning_rate": 0.004792003534941759, + "loss": 2.7249, + "mean_token_accuracy": 0.43224596977233887, + "num_tokens": 7089706407.0, + "step": 13869 + }, + { + "epoch": 3.750676041103299, + "grad_norm": 2.421875, + "learning_rate": 0.004790862218079791, + "loss": 2.8323, + "mean_token_accuracy": 0.43339917063713074, + "num_tokens": 7090230521.0, + "step": 13870 + }, + { + "epoch": 3.750946457544619, + "grad_norm": 5.875, + "learning_rate": 0.0047897210917324335, + "loss": 10.2919, + "mean_token_accuracy": 0.0005049232859164476, + "num_tokens": 7090754747.0, + "step": 13871 + }, + { + "epoch": 3.751216873985938, + "grad_norm": 6.5, + "learning_rate": 0.004788580155934694, + "loss": 2.9801, + "mean_token_accuracy": 0.41086921095848083, + "num_tokens": 7091278837.0, + "step": 13872 + }, + { + "epoch": 3.751487290427258, + "grad_norm": 3.078125, + "learning_rate": 0.004787439410721583, + "loss": 2.7838, + "mean_token_accuracy": 0.44739991426467896, + "num_tokens": 7091779249.0, + "step": 13873 + }, + { + "epoch": 3.7517577068685775, + "grad_norm": 3.296875, + "learning_rate": 0.004786298856128101, + "loss": 2.9523, + "mean_token_accuracy": 0.4072442650794983, + "num_tokens": 7092303518.0, + "step": 13874 + }, + { + "epoch": 3.752028123309897, + "grad_norm": 2.96875, + "learning_rate": 0.004785158492189242, + "loss": 2.9645, + "mean_token_accuracy": 0.4078636169433594, + "num_tokens": 7092794038.0, + "step": 13875 + }, + { + "epoch": 3.7522985397512167, + "grad_norm": 3.78125, + "learning_rate": 0.0047840183189399995, + "loss": 3.0818, + "mean_token_accuracy": 0.43176665902137756, + "num_tokens": 7093305376.0, + "step": 13876 + }, + { + "epoch": 3.7525689561925364, + "grad_norm": 3.796875, + "learning_rate": 0.004782878336415354, + "loss": 2.7012, + "mean_token_accuracy": 0.45415812730789185, + "num_tokens": 7093829512.0, + "step": 13877 + }, + { + "epoch": 3.752839372633856, + "grad_norm": 2.96875, + "learning_rate": 0.004781738544650283, + "loss": 3.0732, + "mean_token_accuracy": 0.4042893648147583, + "num_tokens": 7094353784.0, + "step": 13878 + }, + { + "epoch": 3.7531097890751757, + "grad_norm": 3.625, + "learning_rate": 0.004780598943679762, + "loss": 3.0395, + "mean_token_accuracy": 0.4053207039833069, + "num_tokens": 7094846033.0, + "step": 13879 + }, + { + "epoch": 3.7533802055164953, + "grad_norm": 2.953125, + "learning_rate": 0.0047794595335387555, + "loss": 2.9991, + "mean_token_accuracy": 0.42637163400650024, + "num_tokens": 7095370173.0, + "step": 13880 + }, + { + "epoch": 3.753650621957815, + "grad_norm": 3.65625, + "learning_rate": 0.004778320314262222, + "loss": 2.933, + "mean_token_accuracy": 0.4235789179801941, + "num_tokens": 7095894363.0, + "step": 13881 + }, + { + "epoch": 3.7539210383991346, + "grad_norm": 2.734375, + "learning_rate": 0.0047771812858851195, + "loss": 2.6596, + "mean_token_accuracy": 0.4272834062576294, + "num_tokens": 7096411382.0, + "step": 13882 + }, + { + "epoch": 3.7541914548404542, + "grad_norm": 2.625, + "learning_rate": 0.004776042448442393, + "loss": 2.8541, + "mean_token_accuracy": 0.4329039454460144, + "num_tokens": 7096935566.0, + "step": 13883 + }, + { + "epoch": 3.754461871281774, + "grad_norm": 3.078125, + "learning_rate": 0.004774903801968988, + "loss": 3.052, + "mean_token_accuracy": 0.42869502305984497, + "num_tokens": 7097459819.0, + "step": 13884 + }, + { + "epoch": 3.7547322877230935, + "grad_norm": 3.15625, + "learning_rate": 0.004773765346499845, + "loss": 3.0683, + "mean_token_accuracy": 0.40485554933547974, + "num_tokens": 7097947281.0, + "step": 13885 + }, + { + "epoch": 3.755002704164413, + "grad_norm": 3.078125, + "learning_rate": 0.004772627082069886, + "loss": 2.857, + "mean_token_accuracy": 0.42798882722854614, + "num_tokens": 7098459661.0, + "step": 13886 + }, + { + "epoch": 3.755273120605733, + "grad_norm": 2.703125, + "learning_rate": 0.004771489008714046, + "loss": 2.8028, + "mean_token_accuracy": 0.4300321042537689, + "num_tokens": 7098983917.0, + "step": 13887 + }, + { + "epoch": 3.7555435370470525, + "grad_norm": 2.875, + "learning_rate": 0.004770351126467239, + "loss": 2.923, + "mean_token_accuracy": 0.4045587182044983, + "num_tokens": 7099508055.0, + "step": 13888 + }, + { + "epoch": 3.755813953488372, + "grad_norm": 2.8125, + "learning_rate": 0.004769213435364379, + "loss": 2.9044, + "mean_token_accuracy": 0.4228147864341736, + "num_tokens": 7100032254.0, + "step": 13889 + }, + { + "epoch": 3.7560843699296917, + "grad_norm": 2.703125, + "learning_rate": 0.004768075935440375, + "loss": 3.1037, + "mean_token_accuracy": 0.40480121970176697, + "num_tokens": 7100527797.0, + "step": 13890 + }, + { + "epoch": 3.7563547863710114, + "grad_norm": 9.4375, + "learning_rate": 0.00476693862673013, + "loss": 10.9473, + "mean_token_accuracy": 0.032712168991565704, + "num_tokens": 7101022476.0, + "step": 13891 + }, + { + "epoch": 3.756625202812331, + "grad_norm": 5.25, + "learning_rate": 0.004765801509268533, + "loss": 2.6573, + "mean_token_accuracy": 0.4466686248779297, + "num_tokens": 7101546520.0, + "step": 13892 + }, + { + "epoch": 3.7568956192536507, + "grad_norm": 2.28125, + "learning_rate": 0.004764664583090484, + "loss": 2.7398, + "mean_token_accuracy": 0.43050605058670044, + "num_tokens": 7102070799.0, + "step": 13893 + }, + { + "epoch": 3.7571660356949703, + "grad_norm": 2.1875, + "learning_rate": 0.004763527848230862, + "loss": 2.9229, + "mean_token_accuracy": 0.42660820484161377, + "num_tokens": 7102595077.0, + "step": 13894 + }, + { + "epoch": 3.75743645213629, + "grad_norm": 4.0, + "learning_rate": 0.004762391304724543, + "loss": 2.96, + "mean_token_accuracy": 0.42596328258514404, + "num_tokens": 7103119200.0, + "step": 13895 + }, + { + "epoch": 3.7577068685776096, + "grad_norm": 3.8125, + "learning_rate": 0.004761254952606406, + "loss": 2.9502, + "mean_token_accuracy": 0.42660388350486755, + "num_tokens": 7103643413.0, + "step": 13896 + }, + { + "epoch": 3.7579772850189292, + "grad_norm": 3.484375, + "learning_rate": 0.00476011879191131, + "loss": 2.8959, + "mean_token_accuracy": 0.4256635904312134, + "num_tokens": 7104108250.0, + "step": 13897 + }, + { + "epoch": 3.758247701460249, + "grad_norm": 3.125, + "learning_rate": 0.00475898282267412, + "loss": 2.9418, + "mean_token_accuracy": 0.40271276235580444, + "num_tokens": 7104632469.0, + "step": 13898 + }, + { + "epoch": 3.7585181179015685, + "grad_norm": 3.625, + "learning_rate": 0.004757847044929694, + "loss": 2.9674, + "mean_token_accuracy": 0.4234495162963867, + "num_tokens": 7105156728.0, + "step": 13899 + }, + { + "epoch": 3.758788534342888, + "grad_norm": 3.296875, + "learning_rate": 0.004756711458712877, + "loss": 3.0154, + "mean_token_accuracy": 0.4150605797767639, + "num_tokens": 7105680908.0, + "step": 13900 + }, + { + "epoch": 3.759058950784208, + "grad_norm": 3.09375, + "learning_rate": 0.00475557606405851, + "loss": 2.8935, + "mean_token_accuracy": 0.4229685068130493, + "num_tokens": 7106205115.0, + "step": 13901 + }, + { + "epoch": 3.7593293672255275, + "grad_norm": 2.75, + "learning_rate": 0.004754440861001436, + "loss": 2.672, + "mean_token_accuracy": 0.4433210492134094, + "num_tokens": 7106729239.0, + "step": 13902 + }, + { + "epoch": 3.759599783666847, + "grad_norm": 9.0, + "learning_rate": 0.004753305849576484, + "loss": 2.6854, + "mean_token_accuracy": 0.45951423048973083, + "num_tokens": 7107253441.0, + "step": 13903 + }, + { + "epoch": 3.7598702001081667, + "grad_norm": 2.875, + "learning_rate": 0.004752171029818475, + "loss": 2.7155, + "mean_token_accuracy": 0.4277198314666748, + "num_tokens": 7107777686.0, + "step": 13904 + }, + { + "epoch": 3.760140616549486, + "grad_norm": 2.8125, + "learning_rate": 0.0047510364017622365, + "loss": 2.8431, + "mean_token_accuracy": 0.4389355778694153, + "num_tokens": 7108301962.0, + "step": 13905 + }, + { + "epoch": 3.760411032990806, + "grad_norm": 2.375, + "learning_rate": 0.004749901965442575, + "loss": 2.8412, + "mean_token_accuracy": 0.44276389479637146, + "num_tokens": 7108765026.0, + "step": 13906 + }, + { + "epoch": 3.7606814494321252, + "grad_norm": 2.828125, + "learning_rate": 0.004748767720894303, + "loss": 2.8445, + "mean_token_accuracy": 0.4341621696949005, + "num_tokens": 7109289225.0, + "step": 13907 + }, + { + "epoch": 3.7609518658734453, + "grad_norm": 2.984375, + "learning_rate": 0.004747633668152223, + "loss": 2.9019, + "mean_token_accuracy": 0.42318224906921387, + "num_tokens": 7109813482.0, + "step": 13908 + }, + { + "epoch": 3.7612222823147645, + "grad_norm": 3.234375, + "learning_rate": 0.004746499807251125, + "loss": 2.9384, + "mean_token_accuracy": 0.4225825369358063, + "num_tokens": 7110337759.0, + "step": 13909 + }, + { + "epoch": 3.7614926987560846, + "grad_norm": 3.203125, + "learning_rate": 0.004745366138225806, + "loss": 2.7923, + "mean_token_accuracy": 0.43809452652931213, + "num_tokens": 7110862004.0, + "step": 13910 + }, + { + "epoch": 3.761763115197404, + "grad_norm": 22.625, + "learning_rate": 0.004744232661111045, + "loss": 12.8811, + "mean_token_accuracy": 7.619663665536791e-05, + "num_tokens": 7111374482.0, + "step": 13911 + }, + { + "epoch": 3.762033531638724, + "grad_norm": 7.0, + "learning_rate": 0.0047430993759416265, + "loss": 2.9999, + "mean_token_accuracy": 0.3766244053840637, + "num_tokens": 7111896516.0, + "step": 13912 + }, + { + "epoch": 3.762303948080043, + "grad_norm": 2.21875, + "learning_rate": 0.004741966282752317, + "loss": 3.0577, + "mean_token_accuracy": 0.4259936213493347, + "num_tokens": 7112324661.0, + "step": 13913 + }, + { + "epoch": 3.7625743645213627, + "grad_norm": 2.609375, + "learning_rate": 0.004740833381577886, + "loss": 2.7869, + "mean_token_accuracy": 0.437887966632843, + "num_tokens": 7112848876.0, + "step": 13914 + }, + { + "epoch": 3.7628447809626824, + "grad_norm": 4.03125, + "learning_rate": 0.0047397006724530935, + "loss": 2.8838, + "mean_token_accuracy": 0.4212849736213684, + "num_tokens": 7113373043.0, + "step": 13915 + }, + { + "epoch": 3.763115197404002, + "grad_norm": 3.875, + "learning_rate": 0.004738568155412696, + "loss": 2.9299, + "mean_token_accuracy": 0.43206122517585754, + "num_tokens": 7113897312.0, + "step": 13916 + }, + { + "epoch": 3.7633856138453217, + "grad_norm": 3.953125, + "learning_rate": 0.004737435830491442, + "loss": 3.0839, + "mean_token_accuracy": 0.3985956907272339, + "num_tokens": 7114421484.0, + "step": 13917 + }, + { + "epoch": 3.7636560302866413, + "grad_norm": 3.828125, + "learning_rate": 0.004736303697724071, + "loss": 3.0158, + "mean_token_accuracy": 0.4288906157016754, + "num_tokens": 7114945668.0, + "step": 13918 + }, + { + "epoch": 3.763926446727961, + "grad_norm": 3.5625, + "learning_rate": 0.0047351717571453246, + "loss": 2.8, + "mean_token_accuracy": 0.414093017578125, + "num_tokens": 7115469866.0, + "step": 13919 + }, + { + "epoch": 3.7641968631692806, + "grad_norm": 2.421875, + "learning_rate": 0.004734040008789934, + "loss": 2.9222, + "mean_token_accuracy": 0.4264649748802185, + "num_tokens": 7115994039.0, + "step": 13920 + }, + { + "epoch": 3.7644672796106002, + "grad_norm": 3.234375, + "learning_rate": 0.004732908452692618, + "loss": 2.7384, + "mean_token_accuracy": 0.44413095712661743, + "num_tokens": 7116518188.0, + "step": 13921 + }, + { + "epoch": 3.76473769605192, + "grad_norm": 3.09375, + "learning_rate": 0.004731777088888106, + "loss": 2.6025, + "mean_token_accuracy": 0.459377646446228, + "num_tokens": 7117042431.0, + "step": 13922 + }, + { + "epoch": 3.7650081124932395, + "grad_norm": 2.84375, + "learning_rate": 0.004730645917411105, + "loss": 2.9228, + "mean_token_accuracy": 0.4254912734031677, + "num_tokens": 7117566541.0, + "step": 13923 + }, + { + "epoch": 3.765278528934559, + "grad_norm": 3.390625, + "learning_rate": 0.004729514938296322, + "loss": 2.7507, + "mean_token_accuracy": 0.44243693351745605, + "num_tokens": 7118090619.0, + "step": 13924 + }, + { + "epoch": 3.765548945375879, + "grad_norm": 15.6875, + "learning_rate": 0.004728384151578464, + "loss": 2.8872, + "mean_token_accuracy": 0.4310300350189209, + "num_tokens": 7118556697.0, + "step": 13925 + }, + { + "epoch": 3.7658193618171985, + "grad_norm": 4.125, + "learning_rate": 0.00472725355729222, + "loss": 2.7872, + "mean_token_accuracy": 0.4390750527381897, + "num_tokens": 7119080924.0, + "step": 13926 + }, + { + "epoch": 3.766089778258518, + "grad_norm": 2.8125, + "learning_rate": 0.004726123155472286, + "loss": 2.8894, + "mean_token_accuracy": 0.4367593228816986, + "num_tokens": 7119605205.0, + "step": 13927 + }, + { + "epoch": 3.7663601946998377, + "grad_norm": 3.171875, + "learning_rate": 0.004724992946153345, + "loss": 3.0562, + "mean_token_accuracy": 0.41325142979621887, + "num_tokens": 7120129406.0, + "step": 13928 + }, + { + "epoch": 3.7666306111411574, + "grad_norm": 3.78125, + "learning_rate": 0.004723862929370071, + "loss": 3.009, + "mean_token_accuracy": 0.43068769574165344, + "num_tokens": 7120649364.0, + "step": 13929 + }, + { + "epoch": 3.766901027582477, + "grad_norm": 2.515625, + "learning_rate": 0.004722733105157142, + "loss": 2.7015, + "mean_token_accuracy": 0.4408629536628723, + "num_tokens": 7121122896.0, + "step": 13930 + }, + { + "epoch": 3.7671714440237967, + "grad_norm": 3.828125, + "learning_rate": 0.004721603473549223, + "loss": 10.7405, + "mean_token_accuracy": 0.009788557887077332, + "num_tokens": 7121647175.0, + "step": 13931 + }, + { + "epoch": 3.7674418604651163, + "grad_norm": 4.875, + "learning_rate": 0.00472047403458097, + "loss": 2.9484, + "mean_token_accuracy": 0.4442215859889984, + "num_tokens": 7122171316.0, + "step": 13932 + }, + { + "epoch": 3.767712276906436, + "grad_norm": 2.234375, + "learning_rate": 0.004719344788287042, + "loss": 3.087, + "mean_token_accuracy": 0.41329091787338257, + "num_tokens": 7122695563.0, + "step": 13933 + }, + { + "epoch": 3.7679826933477556, + "grad_norm": 3.453125, + "learning_rate": 0.004718215734702087, + "loss": 2.7637, + "mean_token_accuracy": 0.41648995876312256, + "num_tokens": 7123219677.0, + "step": 13934 + }, + { + "epoch": 3.7682531097890752, + "grad_norm": 3.328125, + "learning_rate": 0.004717086873860743, + "loss": 2.4731, + "mean_token_accuracy": 0.45070430636405945, + "num_tokens": 7123743892.0, + "step": 13935 + }, + { + "epoch": 3.768523526230395, + "grad_norm": 3.546875, + "learning_rate": 0.004715958205797653, + "loss": 2.9469, + "mean_token_accuracy": 0.4297904372215271, + "num_tokens": 7124268152.0, + "step": 13936 + }, + { + "epoch": 3.7687939426717145, + "grad_norm": 3.4375, + "learning_rate": 0.004714829730547447, + "loss": 2.7154, + "mean_token_accuracy": 0.44975602626800537, + "num_tokens": 7124774482.0, + "step": 13937 + }, + { + "epoch": 3.769064359113034, + "grad_norm": 3.046875, + "learning_rate": 0.004713701448144745, + "loss": 2.7389, + "mean_token_accuracy": 0.45984089374542236, + "num_tokens": 7125298704.0, + "step": 13938 + }, + { + "epoch": 3.769334775554354, + "grad_norm": 13.4375, + "learning_rate": 0.004712573358624173, + "loss": 2.7843, + "mean_token_accuracy": 0.42719215154647827, + "num_tokens": 7125822855.0, + "step": 13939 + }, + { + "epoch": 3.7696051919956735, + "grad_norm": 3.78125, + "learning_rate": 0.004711445462020337, + "loss": 2.9738, + "mean_token_accuracy": 0.4207647144794464, + "num_tokens": 7126347082.0, + "step": 13940 + }, + { + "epoch": 3.769875608436993, + "grad_norm": 2.578125, + "learning_rate": 0.004710317758367846, + "loss": 2.8094, + "mean_token_accuracy": 0.4488564729690552, + "num_tokens": 7126854575.0, + "step": 13941 + }, + { + "epoch": 3.7701460248783127, + "grad_norm": 3.234375, + "learning_rate": 0.0047091902477013085, + "loss": 2.7666, + "mean_token_accuracy": 0.4402387738227844, + "num_tokens": 7127378841.0, + "step": 13942 + }, + { + "epoch": 3.7704164413196324, + "grad_norm": 2.640625, + "learning_rate": 0.004708062930055313, + "loss": 2.947, + "mean_token_accuracy": 0.434334933757782, + "num_tokens": 7127903044.0, + "step": 13943 + }, + { + "epoch": 3.770686857760952, + "grad_norm": 2.921875, + "learning_rate": 0.0047069358054644475, + "loss": 2.9595, + "mean_token_accuracy": 0.4174867272377014, + "num_tokens": 7128427320.0, + "step": 13944 + }, + { + "epoch": 3.7709572742022717, + "grad_norm": 2.71875, + "learning_rate": 0.0047058088739633, + "loss": 2.8089, + "mean_token_accuracy": 0.42451566457748413, + "num_tokens": 7128951452.0, + "step": 13945 + }, + { + "epoch": 3.771227690643591, + "grad_norm": 3.5, + "learning_rate": 0.004704682135586447, + "loss": 2.9383, + "mean_token_accuracy": 0.4134695529937744, + "num_tokens": 7129475637.0, + "step": 13946 + }, + { + "epoch": 3.771498107084911, + "grad_norm": 3.34375, + "learning_rate": 0.004703555590368457, + "loss": 2.8914, + "mean_token_accuracy": 0.4453597068786621, + "num_tokens": 7129999738.0, + "step": 13947 + }, + { + "epoch": 3.77176852352623, + "grad_norm": 2.75, + "learning_rate": 0.0047024292383439, + "loss": 2.7542, + "mean_token_accuracy": 0.44382962584495544, + "num_tokens": 7130480356.0, + "step": 13948 + }, + { + "epoch": 3.7720389399675502, + "grad_norm": 2.875, + "learning_rate": 0.004701303079547335, + "loss": 2.7368, + "mean_token_accuracy": 0.4283598065376282, + "num_tokens": 7131004450.0, + "step": 13949 + }, + { + "epoch": 3.7723093564088694, + "grad_norm": 2.84375, + "learning_rate": 0.004700177114013312, + "loss": 2.9544, + "mean_token_accuracy": 0.422434002161026, + "num_tokens": 7131528669.0, + "step": 13950 + }, + { + "epoch": 3.7725797728501895, + "grad_norm": 3.203125, + "learning_rate": 0.004699051341776383, + "loss": 10.6951, + "mean_token_accuracy": 1.0384072083979845e-05, + "num_tokens": 7132052874.0, + "step": 13951 + }, + { + "epoch": 3.7728501892915087, + "grad_norm": 5.78125, + "learning_rate": 0.004697925762871088, + "loss": 3.0032, + "mean_token_accuracy": 0.41013190150260925, + "num_tokens": 7132564259.0, + "step": 13952 + }, + { + "epoch": 3.773120605732829, + "grad_norm": 2.1875, + "learning_rate": 0.004696800377331965, + "loss": 2.8825, + "mean_token_accuracy": 0.43062910437583923, + "num_tokens": 7133088487.0, + "step": 13953 + }, + { + "epoch": 3.773391022174148, + "grad_norm": 3.265625, + "learning_rate": 0.004695675185193541, + "loss": 2.8192, + "mean_token_accuracy": 0.4107601046562195, + "num_tokens": 7133612595.0, + "step": 13954 + }, + { + "epoch": 3.7736614386154677, + "grad_norm": 2.6875, + "learning_rate": 0.004694550186490345, + "loss": 3.0421, + "mean_token_accuracy": 0.4160459339618683, + "num_tokens": 7134136835.0, + "step": 13955 + }, + { + "epoch": 3.7739318550567873, + "grad_norm": 3.640625, + "learning_rate": 0.004693425381256889, + "loss": 2.8365, + "mean_token_accuracy": 0.43678849935531616, + "num_tokens": 7134642750.0, + "step": 13956 + }, + { + "epoch": 3.774202271498107, + "grad_norm": 2.953125, + "learning_rate": 0.004692300769527692, + "loss": 2.851, + "mean_token_accuracy": 0.43766599893569946, + "num_tokens": 7135134970.0, + "step": 13957 + }, + { + "epoch": 3.7744726879394266, + "grad_norm": 3.171875, + "learning_rate": 0.0046911763513372545, + "loss": 3.0788, + "mean_token_accuracy": 0.4148544669151306, + "num_tokens": 7135659235.0, + "step": 13958 + }, + { + "epoch": 3.7747431043807462, + "grad_norm": 3.28125, + "learning_rate": 0.004690052126720082, + "loss": 2.7626, + "mean_token_accuracy": 0.4227151870727539, + "num_tokens": 7136183350.0, + "step": 13959 + }, + { + "epoch": 3.775013520822066, + "grad_norm": 13.9375, + "learning_rate": 0.004688928095710666, + "loss": 3.3413, + "mean_token_accuracy": 0.39295175671577454, + "num_tokens": 7136707589.0, + "step": 13960 + }, + { + "epoch": 3.7752839372633855, + "grad_norm": 3.5, + "learning_rate": 0.004687804258343494, + "loss": 2.9207, + "mean_token_accuracy": 0.4253901243209839, + "num_tokens": 7137214479.0, + "step": 13961 + }, + { + "epoch": 3.775554353704705, + "grad_norm": 3.46875, + "learning_rate": 0.004686680614653052, + "loss": 2.6576, + "mean_token_accuracy": 0.43941664695739746, + "num_tokens": 7137738655.0, + "step": 13962 + }, + { + "epoch": 3.775824770146025, + "grad_norm": 3.046875, + "learning_rate": 0.004685557164673815, + "loss": 2.8148, + "mean_token_accuracy": 0.42507994174957275, + "num_tokens": 7138257664.0, + "step": 13963 + }, + { + "epoch": 3.7760951865873444, + "grad_norm": 3.046875, + "learning_rate": 0.004684433908440252, + "loss": 3.0394, + "mean_token_accuracy": 0.4152553975582123, + "num_tokens": 7138750298.0, + "step": 13964 + }, + { + "epoch": 3.776365603028664, + "grad_norm": 2.765625, + "learning_rate": 0.004683310845986831, + "loss": 2.8553, + "mean_token_accuracy": 0.43967872858047485, + "num_tokens": 7139212348.0, + "step": 13965 + }, + { + "epoch": 3.7766360194699837, + "grad_norm": 3.0625, + "learning_rate": 0.004682187977348011, + "loss": 3.0588, + "mean_token_accuracy": 0.43606942892074585, + "num_tokens": 7139700743.0, + "step": 13966 + }, + { + "epoch": 3.7769064359113034, + "grad_norm": 3.046875, + "learning_rate": 0.00468106530255824, + "loss": 2.8066, + "mean_token_accuracy": 0.4108080267906189, + "num_tokens": 7140224901.0, + "step": 13967 + }, + { + "epoch": 3.777176852352623, + "grad_norm": 3.203125, + "learning_rate": 0.004679942821651968, + "loss": 3.0898, + "mean_token_accuracy": 0.3859465718269348, + "num_tokens": 7140690271.0, + "step": 13968 + }, + { + "epoch": 3.7774472687939427, + "grad_norm": 3.171875, + "learning_rate": 0.004678820534663639, + "loss": 2.9361, + "mean_token_accuracy": 0.4164731502532959, + "num_tokens": 7141214516.0, + "step": 13969 + }, + { + "epoch": 3.7777176852352623, + "grad_norm": 3.03125, + "learning_rate": 0.004677698441627683, + "loss": 2.8585, + "mean_token_accuracy": 0.4307974576950073, + "num_tokens": 7141727044.0, + "step": 13970 + }, + { + "epoch": 3.777988101676582, + "grad_norm": 7.53125, + "learning_rate": 0.004676576542578534, + "loss": 10.5468, + "mean_token_accuracy": 0.014912446029484272, + "num_tokens": 7142251057.0, + "step": 13971 + }, + { + "epoch": 3.7782585181179016, + "grad_norm": 6.03125, + "learning_rate": 0.004675454837550613, + "loss": 2.8831, + "mean_token_accuracy": 0.42660170793533325, + "num_tokens": 7142722354.0, + "step": 13972 + }, + { + "epoch": 3.7785289345592212, + "grad_norm": 2.171875, + "learning_rate": 0.004674333326578333, + "loss": 2.8312, + "mean_token_accuracy": 0.4219725728034973, + "num_tokens": 7143246562.0, + "step": 13973 + }, + { + "epoch": 3.778799351000541, + "grad_norm": 3.0, + "learning_rate": 0.004673212009696113, + "loss": 2.8062, + "mean_token_accuracy": 0.4948749244213104, + "num_tokens": 7143645941.0, + "step": 13974 + }, + { + "epoch": 3.7790697674418605, + "grad_norm": 2.78125, + "learning_rate": 0.004672090886938352, + "loss": 2.9604, + "mean_token_accuracy": 0.4229809045791626, + "num_tokens": 7144170185.0, + "step": 13975 + }, + { + "epoch": 3.77934018388318, + "grad_norm": 7.84375, + "learning_rate": 0.0046709699583394534, + "loss": 2.8923, + "mean_token_accuracy": 0.42277470231056213, + "num_tokens": 7144654862.0, + "step": 13976 + }, + { + "epoch": 3.7796106003245, + "grad_norm": 2.75, + "learning_rate": 0.00466984922393381, + "loss": 2.7376, + "mean_token_accuracy": 0.42141586542129517, + "num_tokens": 7145179112.0, + "step": 13977 + }, + { + "epoch": 3.7798810167658194, + "grad_norm": 2.53125, + "learning_rate": 0.004668728683755804, + "loss": 2.9609, + "mean_token_accuracy": 0.43434739112854004, + "num_tokens": 7145683915.0, + "step": 13978 + }, + { + "epoch": 3.780151433207139, + "grad_norm": 3.296875, + "learning_rate": 0.004667608337839825, + "loss": 2.8111, + "mean_token_accuracy": 0.4524928331375122, + "num_tokens": 7146188789.0, + "step": 13979 + }, + { + "epoch": 3.7804218496484587, + "grad_norm": 3.15625, + "learning_rate": 0.004666488186220243, + "loss": 2.7394, + "mean_token_accuracy": 0.4215152859687805, + "num_tokens": 7146713069.0, + "step": 13980 + }, + { + "epoch": 3.7806922660897784, + "grad_norm": 2.796875, + "learning_rate": 0.004665368228931428, + "loss": 2.8525, + "mean_token_accuracy": 0.43956458568573, + "num_tokens": 7147182642.0, + "step": 13981 + }, + { + "epoch": 3.780962682531098, + "grad_norm": 3.21875, + "learning_rate": 0.004664248466007747, + "loss": 2.8145, + "mean_token_accuracy": 0.4300379753112793, + "num_tokens": 7147696278.0, + "step": 13982 + }, + { + "epoch": 3.7812330989724177, + "grad_norm": 2.625, + "learning_rate": 0.004663128897483553, + "loss": 2.8274, + "mean_token_accuracy": 0.426593542098999, + "num_tokens": 7148220412.0, + "step": 13983 + }, + { + "epoch": 3.7815035154137373, + "grad_norm": 2.75, + "learning_rate": 0.0046620095233931985, + "loss": 2.8471, + "mean_token_accuracy": 0.4399145841598511, + "num_tokens": 7148744632.0, + "step": 13984 + }, + { + "epoch": 3.781773931855057, + "grad_norm": 3.34375, + "learning_rate": 0.004660890343771035, + "loss": 3.0419, + "mean_token_accuracy": 0.4156588315963745, + "num_tokens": 7149268867.0, + "step": 13985 + }, + { + "epoch": 3.7820443482963766, + "grad_norm": 3.484375, + "learning_rate": 0.004659771358651398, + "loss": 2.9032, + "mean_token_accuracy": 0.41642484068870544, + "num_tokens": 7149738071.0, + "step": 13986 + }, + { + "epoch": 3.782314764737696, + "grad_norm": 3.46875, + "learning_rate": 0.0046586525680686166, + "loss": 2.9419, + "mean_token_accuracy": 0.4285561442375183, + "num_tokens": 7150262335.0, + "step": 13987 + }, + { + "epoch": 3.782585181179016, + "grad_norm": 3.484375, + "learning_rate": 0.004657533972057026, + "loss": 2.8267, + "mean_token_accuracy": 0.44371795654296875, + "num_tokens": 7150752812.0, + "step": 13988 + }, + { + "epoch": 3.782855597620335, + "grad_norm": 2.609375, + "learning_rate": 0.004656415570650946, + "loss": 2.986, + "mean_token_accuracy": 0.42261257767677307, + "num_tokens": 7151260113.0, + "step": 13989 + }, + { + "epoch": 3.783126014061655, + "grad_norm": 3.125, + "learning_rate": 0.00465529736388469, + "loss": 2.7361, + "mean_token_accuracy": 0.4421575665473938, + "num_tokens": 7151784173.0, + "step": 13990 + }, + { + "epoch": 3.7833964305029744, + "grad_norm": 7.75, + "learning_rate": 0.00465417935179257, + "loss": 10.4813, + "mean_token_accuracy": 0.0, + "num_tokens": 7152308421.0, + "step": 13991 + }, + { + "epoch": 3.7836668469442944, + "grad_norm": 6.0625, + "learning_rate": 0.004653061534408891, + "loss": 3.051, + "mean_token_accuracy": 0.41504135727882385, + "num_tokens": 7152832592.0, + "step": 13992 + }, + { + "epoch": 3.7839372633856136, + "grad_norm": 2.828125, + "learning_rate": 0.004651943911767945, + "loss": 2.8228, + "mean_token_accuracy": 0.4137078821659088, + "num_tokens": 7153356799.0, + "step": 13993 + }, + { + "epoch": 3.7842076798269337, + "grad_norm": 2.96875, + "learning_rate": 0.004650826483904032, + "loss": 2.7667, + "mean_token_accuracy": 0.43144553899765015, + "num_tokens": 7153881006.0, + "step": 13994 + }, + { + "epoch": 3.784478096268253, + "grad_norm": 2.65625, + "learning_rate": 0.0046497092508514335, + "loss": 2.7427, + "mean_token_accuracy": 0.4335983097553253, + "num_tokens": 7154405290.0, + "step": 13995 + }, + { + "epoch": 3.7847485127095726, + "grad_norm": 3.46875, + "learning_rate": 0.004648592212644429, + "loss": 2.7217, + "mean_token_accuracy": 0.42457079887390137, + "num_tokens": 7154929485.0, + "step": 13996 + }, + { + "epoch": 3.785018929150892, + "grad_norm": 2.640625, + "learning_rate": 0.004647475369317291, + "loss": 2.7122, + "mean_token_accuracy": 0.4447953701019287, + "num_tokens": 7155453755.0, + "step": 13997 + }, + { + "epoch": 3.785289345592212, + "grad_norm": 3.046875, + "learning_rate": 0.004646358720904294, + "loss": 2.8445, + "mean_token_accuracy": 0.43494218587875366, + "num_tokens": 7155977900.0, + "step": 13998 + }, + { + "epoch": 3.7855597620335315, + "grad_norm": 2.640625, + "learning_rate": 0.004645242267439695, + "loss": 2.7145, + "mean_token_accuracy": 0.44014719128608704, + "num_tokens": 7156449586.0, + "step": 13999 + }, + { + "epoch": 3.785830178474851, + "grad_norm": 2.671875, + "learning_rate": 0.004644126008957752, + "loss": 3.0367, + "mean_token_accuracy": 0.4146552085876465, + "num_tokens": 7156973765.0, + "step": 14000 + }, + { + "epoch": 3.786100594916171, + "grad_norm": 2.671875, + "learning_rate": 0.004643009945492714, + "loss": 2.6664, + "mean_token_accuracy": 0.4386734068393707, + "num_tokens": 7157446362.0, + "step": 14001 + }, + { + "epoch": 3.7863710113574904, + "grad_norm": 2.578125, + "learning_rate": 0.0046418940770788265, + "loss": 2.8329, + "mean_token_accuracy": 0.43950146436691284, + "num_tokens": 7157951157.0, + "step": 14002 + }, + { + "epoch": 3.78664142779881, + "grad_norm": 3.203125, + "learning_rate": 0.004640778403750327, + "loss": 3.0543, + "mean_token_accuracy": 0.41028285026550293, + "num_tokens": 7158475390.0, + "step": 14003 + }, + { + "epoch": 3.7869118442401297, + "grad_norm": 3.140625, + "learning_rate": 0.004639662925541445, + "loss": 2.8645, + "mean_token_accuracy": 0.42688506841659546, + "num_tokens": 7158999598.0, + "step": 14004 + }, + { + "epoch": 3.7871822606814494, + "grad_norm": 30.625, + "learning_rate": 0.004638547642486411, + "loss": 2.7863, + "mean_token_accuracy": 0.4280353784561157, + "num_tokens": 7159523818.0, + "step": 14005 + }, + { + "epoch": 3.787452677122769, + "grad_norm": 4.5625, + "learning_rate": 0.0046374325546194445, + "loss": 2.8338, + "mean_token_accuracy": 0.412223219871521, + "num_tokens": 7160048040.0, + "step": 14006 + }, + { + "epoch": 3.7877230935640886, + "grad_norm": 2.671875, + "learning_rate": 0.004636317661974755, + "loss": 2.9605, + "mean_token_accuracy": 0.4295092225074768, + "num_tokens": 7160523733.0, + "step": 14007 + }, + { + "epoch": 3.7879935100054083, + "grad_norm": 4.3125, + "learning_rate": 0.004635202964586556, + "loss": 2.9232, + "mean_token_accuracy": 0.4326631426811218, + "num_tokens": 7161047883.0, + "step": 14008 + }, + { + "epoch": 3.788263926446728, + "grad_norm": 3.578125, + "learning_rate": 0.004634088462489048, + "loss": 3.1398, + "mean_token_accuracy": 0.3987038731575012, + "num_tokens": 7161572157.0, + "step": 14009 + }, + { + "epoch": 3.7885343428880476, + "grad_norm": 3.34375, + "learning_rate": 0.004632974155716425, + "loss": 3.0648, + "mean_token_accuracy": 0.4108177125453949, + "num_tokens": 7162096433.0, + "step": 14010 + }, + { + "epoch": 3.7888047593293672, + "grad_norm": 7.9375, + "learning_rate": 0.004631860044302879, + "loss": 9.9995, + "mean_token_accuracy": 0.04181334003806114, + "num_tokens": 7162620689.0, + "step": 14011 + }, + { + "epoch": 3.789075175770687, + "grad_norm": 6.875, + "learning_rate": 0.004630746128282596, + "loss": 3.1502, + "mean_token_accuracy": 0.41481080651283264, + "num_tokens": 7163115616.0, + "step": 14012 + }, + { + "epoch": 3.7893455922120065, + "grad_norm": 2.296875, + "learning_rate": 0.00462963240768975, + "loss": 3.1018, + "mean_token_accuracy": 0.413859099149704, + "num_tokens": 7163614772.0, + "step": 14013 + }, + { + "epoch": 3.789616008653326, + "grad_norm": 3.921875, + "learning_rate": 0.004628518882558519, + "loss": 3.007, + "mean_token_accuracy": 0.41265836358070374, + "num_tokens": 7164138913.0, + "step": 14014 + }, + { + "epoch": 3.789886425094646, + "grad_norm": 3.328125, + "learning_rate": 0.004627405552923066, + "loss": 2.7561, + "mean_token_accuracy": 0.44386035203933716, + "num_tokens": 7164663146.0, + "step": 14015 + }, + { + "epoch": 3.7901568415359654, + "grad_norm": 2.96875, + "learning_rate": 0.004626292418817547, + "loss": 2.8559, + "mean_token_accuracy": 0.432461678981781, + "num_tokens": 7165187284.0, + "step": 14016 + }, + { + "epoch": 3.790427257977285, + "grad_norm": 4.03125, + "learning_rate": 0.004625179480276125, + "loss": 2.8288, + "mean_token_accuracy": 0.456681489944458, + "num_tokens": 7165618725.0, + "step": 14017 + }, + { + "epoch": 3.7906976744186047, + "grad_norm": 3.421875, + "learning_rate": 0.004624066737332944, + "loss": 2.9836, + "mean_token_accuracy": 0.4132463335990906, + "num_tokens": 7166142918.0, + "step": 14018 + }, + { + "epoch": 3.7909680908599244, + "grad_norm": 3.4375, + "learning_rate": 0.0046229541900221416, + "loss": 2.9252, + "mean_token_accuracy": 0.4178328514099121, + "num_tokens": 7166667096.0, + "step": 14019 + }, + { + "epoch": 3.791238507301244, + "grad_norm": 3.234375, + "learning_rate": 0.004621841838377862, + "loss": 3.1135, + "mean_token_accuracy": 0.41259700059890747, + "num_tokens": 7167191276.0, + "step": 14020 + }, + { + "epoch": 3.7915089237425637, + "grad_norm": 3.046875, + "learning_rate": 0.004620729682434229, + "loss": 2.8364, + "mean_token_accuracy": 0.4377518594264984, + "num_tokens": 7167715494.0, + "step": 14021 + }, + { + "epoch": 3.7917793401838833, + "grad_norm": 2.65625, + "learning_rate": 0.004619617722225372, + "loss": 2.8981, + "mean_token_accuracy": 0.4293677806854248, + "num_tokens": 7168239762.0, + "step": 14022 + }, + { + "epoch": 3.792049756625203, + "grad_norm": 3.078125, + "learning_rate": 0.004618505957785408, + "loss": 2.6804, + "mean_token_accuracy": 0.44235920906066895, + "num_tokens": 7168764044.0, + "step": 14023 + }, + { + "epoch": 3.7923201730665226, + "grad_norm": 2.109375, + "learning_rate": 0.004617394389148443, + "loss": 3.1275, + "mean_token_accuracy": 0.4020864963531494, + "num_tokens": 7169288291.0, + "step": 14024 + }, + { + "epoch": 3.7925905895078422, + "grad_norm": 18.625, + "learning_rate": 0.004616283016348592, + "loss": 3.0522, + "mean_token_accuracy": 0.3801681399345398, + "num_tokens": 7169812549.0, + "step": 14025 + }, + { + "epoch": 3.792861005949162, + "grad_norm": 5.40625, + "learning_rate": 0.004615171839419949, + "loss": 3.0312, + "mean_token_accuracy": 0.42104870080947876, + "num_tokens": 7170336834.0, + "step": 14026 + }, + { + "epoch": 3.7931314223904815, + "grad_norm": 2.6875, + "learning_rate": 0.0046140608583966096, + "loss": 2.8921, + "mean_token_accuracy": 0.410455584526062, + "num_tokens": 7170861099.0, + "step": 14027 + }, + { + "epoch": 3.7934018388318007, + "grad_norm": 3.34375, + "learning_rate": 0.004612950073312666, + "loss": 3.0112, + "mean_token_accuracy": 0.4348623752593994, + "num_tokens": 7171321397.0, + "step": 14028 + }, + { + "epoch": 3.793672255273121, + "grad_norm": 3.859375, + "learning_rate": 0.004611839484202197, + "loss": 2.8009, + "mean_token_accuracy": 0.4280877113342285, + "num_tokens": 7171827441.0, + "step": 14029 + }, + { + "epoch": 3.79394267171444, + "grad_norm": 3.515625, + "learning_rate": 0.004610729091099277, + "loss": 2.7478, + "mean_token_accuracy": 0.43717747926712036, + "num_tokens": 7172351572.0, + "step": 14030 + }, + { + "epoch": 3.79421308815576, + "grad_norm": 2.15625, + "learning_rate": 0.00460961889403798, + "loss": 9.6739, + "mean_token_accuracy": 0.008652996271848679, + "num_tokens": 7172875721.0, + "step": 14031 + }, + { + "epoch": 3.7944835045970793, + "grad_norm": 7.5625, + "learning_rate": 0.00460850889305237, + "loss": 2.9922, + "mean_token_accuracy": 0.4193769693374634, + "num_tokens": 7173392895.0, + "step": 14032 + }, + { + "epoch": 3.7947539210383994, + "grad_norm": 2.28125, + "learning_rate": 0.004607399088176499, + "loss": 2.9143, + "mean_token_accuracy": 0.4294012486934662, + "num_tokens": 7173899227.0, + "step": 14033 + }, + { + "epoch": 3.7950243374797186, + "grad_norm": 2.703125, + "learning_rate": 0.0046062894794444255, + "loss": 2.7993, + "mean_token_accuracy": 0.44140830636024475, + "num_tokens": 7174387384.0, + "step": 14034 + }, + { + "epoch": 3.7952947539210387, + "grad_norm": 2.8125, + "learning_rate": 0.004605180066890194, + "loss": 2.7488, + "mean_token_accuracy": 0.430682897567749, + "num_tokens": 7174911578.0, + "step": 14035 + }, + { + "epoch": 3.795565170362358, + "grad_norm": 3.03125, + "learning_rate": 0.004604070850547841, + "loss": 3.1171, + "mean_token_accuracy": 0.4009019136428833, + "num_tokens": 7175435853.0, + "step": 14036 + }, + { + "epoch": 3.7958355868036775, + "grad_norm": 3.640625, + "learning_rate": 0.004602961830451408, + "loss": 2.9948, + "mean_token_accuracy": 0.42433586716651917, + "num_tokens": 7175927329.0, + "step": 14037 + }, + { + "epoch": 3.796106003244997, + "grad_norm": 3.3125, + "learning_rate": 0.004601853006634914, + "loss": 2.9412, + "mean_token_accuracy": 0.4280295968055725, + "num_tokens": 7176451442.0, + "step": 14038 + }, + { + "epoch": 3.796376419686317, + "grad_norm": 2.984375, + "learning_rate": 0.004600744379132389, + "loss": 3.0801, + "mean_token_accuracy": 0.3983675241470337, + "num_tokens": 7176975699.0, + "step": 14039 + }, + { + "epoch": 3.7966468361276364, + "grad_norm": 2.921875, + "learning_rate": 0.004599635947977842, + "loss": 3.0517, + "mean_token_accuracy": 0.4008672535419464, + "num_tokens": 7177499959.0, + "step": 14040 + }, + { + "epoch": 3.796917252568956, + "grad_norm": 3.0625, + "learning_rate": 0.004598527713205291, + "loss": 2.7005, + "mean_token_accuracy": 0.4478788375854492, + "num_tokens": 7177998423.0, + "step": 14041 + }, + { + "epoch": 3.7971876690102757, + "grad_norm": 3.328125, + "learning_rate": 0.00459741967484873, + "loss": 2.6276, + "mean_token_accuracy": 0.4538505971431732, + "num_tokens": 7178490061.0, + "step": 14042 + }, + { + "epoch": 3.7974580854515954, + "grad_norm": 2.859375, + "learning_rate": 0.004596311832942167, + "loss": 2.8512, + "mean_token_accuracy": 0.44270747900009155, + "num_tokens": 7178968015.0, + "step": 14043 + }, + { + "epoch": 3.797728501892915, + "grad_norm": 3.09375, + "learning_rate": 0.004595204187519585, + "loss": 2.9833, + "mean_token_accuracy": 0.4296465814113617, + "num_tokens": 7179492222.0, + "step": 14044 + }, + { + "epoch": 3.7979989183342346, + "grad_norm": 3.765625, + "learning_rate": 0.004594096738614976, + "loss": 2.9935, + "mean_token_accuracy": 0.43397802114486694, + "num_tokens": 7180016468.0, + "step": 14045 + }, + { + "epoch": 3.7982693347755543, + "grad_norm": 3.640625, + "learning_rate": 0.004592989486262319, + "loss": 2.748, + "mean_token_accuracy": 0.45255807042121887, + "num_tokens": 7180540704.0, + "step": 14046 + }, + { + "epoch": 3.798539751216874, + "grad_norm": 3.203125, + "learning_rate": 0.004591882430495582, + "loss": 2.7829, + "mean_token_accuracy": 0.43576183915138245, + "num_tokens": 7181064736.0, + "step": 14047 + }, + { + "epoch": 3.7988101676581936, + "grad_norm": 3.5625, + "learning_rate": 0.004590775571348742, + "loss": 2.581, + "mean_token_accuracy": 0.4381968677043915, + "num_tokens": 7181588890.0, + "step": 14048 + }, + { + "epoch": 3.799080584099513, + "grad_norm": 2.515625, + "learning_rate": 0.004589668908855754, + "loss": 2.9001, + "mean_token_accuracy": 0.4273757338523865, + "num_tokens": 7182113164.0, + "step": 14049 + }, + { + "epoch": 3.799351000540833, + "grad_norm": 4.03125, + "learning_rate": 0.004588562443050574, + "loss": 3.081, + "mean_token_accuracy": 0.41573721170425415, + "num_tokens": 7182637352.0, + "step": 14050 + }, + { + "epoch": 3.7996214169821525, + "grad_norm": 6.78125, + "learning_rate": 0.004587456173967154, + "loss": 9.3195, + "mean_token_accuracy": 0.016567867249250412, + "num_tokens": 7183161525.0, + "step": 14051 + }, + { + "epoch": 3.799891833423472, + "grad_norm": 6.21875, + "learning_rate": 0.0045863501016394385, + "loss": 2.9866, + "mean_token_accuracy": 0.4302212595939636, + "num_tokens": 7183685713.0, + "step": 14052 + }, + { + "epoch": 3.800162249864792, + "grad_norm": 2.21875, + "learning_rate": 0.00458524422610136, + "loss": 3.0753, + "mean_token_accuracy": 0.4442118704319, + "num_tokens": 7184089099.0, + "step": 14053 + }, + { + "epoch": 3.8004326663061114, + "grad_norm": 2.8125, + "learning_rate": 0.004584138547386853, + "loss": 2.7713, + "mean_token_accuracy": 0.41891467571258545, + "num_tokens": 7184613238.0, + "step": 14054 + }, + { + "epoch": 3.800703082747431, + "grad_norm": 29.0, + "learning_rate": 0.004583033065529846, + "loss": 3.1427, + "mean_token_accuracy": 0.4113755524158478, + "num_tokens": 7185137440.0, + "step": 14055 + }, + { + "epoch": 3.8009734991887507, + "grad_norm": 3.84375, + "learning_rate": 0.0045819277805642525, + "loss": 2.9278, + "mean_token_accuracy": 0.4073043465614319, + "num_tokens": 7185655226.0, + "step": 14056 + }, + { + "epoch": 3.8012439156300704, + "grad_norm": 3.3125, + "learning_rate": 0.004580822692523991, + "loss": 2.985, + "mean_token_accuracy": 0.4099518060684204, + "num_tokens": 7186179319.0, + "step": 14057 + }, + { + "epoch": 3.80151433207139, + "grad_norm": 3.59375, + "learning_rate": 0.004579717801442969, + "loss": 2.918, + "mean_token_accuracy": 0.4173092544078827, + "num_tokens": 7186703506.0, + "step": 14058 + }, + { + "epoch": 3.8017847485127096, + "grad_norm": 2.9375, + "learning_rate": 0.004578613107355081, + "loss": 3.0925, + "mean_token_accuracy": 0.43648046255111694, + "num_tokens": 7187164835.0, + "step": 14059 + }, + { + "epoch": 3.8020551649540293, + "grad_norm": 2.859375, + "learning_rate": 0.00457750861029423, + "loss": 2.8206, + "mean_token_accuracy": 0.4421123266220093, + "num_tokens": 7187662853.0, + "step": 14060 + }, + { + "epoch": 3.802325581395349, + "grad_norm": 3.1875, + "learning_rate": 0.0045764043102943025, + "loss": 2.6618, + "mean_token_accuracy": 0.43491610884666443, + "num_tokens": 7188186863.0, + "step": 14061 + }, + { + "epoch": 3.8025959978366686, + "grad_norm": 2.796875, + "learning_rate": 0.004575300207389179, + "loss": 2.7313, + "mean_token_accuracy": 0.43151408433914185, + "num_tokens": 7188711136.0, + "step": 14062 + }, + { + "epoch": 3.802866414277988, + "grad_norm": 3.671875, + "learning_rate": 0.004574196301612742, + "loss": 3.0553, + "mean_token_accuracy": 0.4403565227985382, + "num_tokens": 7189124418.0, + "step": 14063 + }, + { + "epoch": 3.803136830719308, + "grad_norm": 2.84375, + "learning_rate": 0.004573092592998858, + "loss": 2.851, + "mean_token_accuracy": 0.4412198066711426, + "num_tokens": 7189596419.0, + "step": 14064 + }, + { + "epoch": 3.8034072471606275, + "grad_norm": 3.125, + "learning_rate": 0.004571989081581392, + "loss": 2.8337, + "mean_token_accuracy": 0.4352089464664459, + "num_tokens": 7190120401.0, + "step": 14065 + }, + { + "epoch": 3.803677663601947, + "grad_norm": 2.84375, + "learning_rate": 0.004570885767394207, + "loss": 2.7625, + "mean_token_accuracy": 0.4494093954563141, + "num_tokens": 7190596051.0, + "step": 14066 + }, + { + "epoch": 3.803948080043267, + "grad_norm": 3.84375, + "learning_rate": 0.0045697826504711506, + "loss": 3.0399, + "mean_token_accuracy": 0.4304787516593933, + "num_tokens": 7191110431.0, + "step": 14067 + }, + { + "epoch": 3.8042184964845864, + "grad_norm": 3.328125, + "learning_rate": 0.004568679730846073, + "loss": 3.0803, + "mean_token_accuracy": 0.41206443309783936, + "num_tokens": 7191609179.0, + "step": 14068 + }, + { + "epoch": 3.8044889129259056, + "grad_norm": 2.671875, + "learning_rate": 0.004567577008552817, + "loss": 2.7312, + "mean_token_accuracy": 0.4416937530040741, + "num_tokens": 7192133392.0, + "step": 14069 + }, + { + "epoch": 3.8047593293672257, + "grad_norm": 2.515625, + "learning_rate": 0.004566474483625211, + "loss": 2.832, + "mean_token_accuracy": 0.4206814169883728, + "num_tokens": 7192657651.0, + "step": 14070 + }, + { + "epoch": 3.805029745808545, + "grad_norm": 53.0, + "learning_rate": 0.004565372156097092, + "loss": 15.0403, + "mean_token_accuracy": 0.00017100610421039164, + "num_tokens": 7193181846.0, + "step": 14071 + }, + { + "epoch": 3.805300162249865, + "grad_norm": 5.28125, + "learning_rate": 0.004564270026002277, + "loss": 2.918, + "mean_token_accuracy": 0.4363745450973511, + "num_tokens": 7193652232.0, + "step": 14072 + }, + { + "epoch": 3.805570578691184, + "grad_norm": 2.40625, + "learning_rate": 0.004563168093374583, + "loss": 2.812, + "mean_token_accuracy": 0.43346041440963745, + "num_tokens": 7194176487.0, + "step": 14073 + }, + { + "epoch": 3.8058409951325043, + "grad_norm": 3.015625, + "learning_rate": 0.004562066358247822, + "loss": 2.7548, + "mean_token_accuracy": 0.4286331236362457, + "num_tokens": 7194655919.0, + "step": 14074 + }, + { + "epoch": 3.8061114115738235, + "grad_norm": 2.65625, + "learning_rate": 0.004560964820655798, + "loss": 2.7822, + "mean_token_accuracy": 0.44679248332977295, + "num_tokens": 7195164751.0, + "step": 14075 + }, + { + "epoch": 3.8063818280151436, + "grad_norm": 3.265625, + "learning_rate": 0.004559863480632308, + "loss": 3.0187, + "mean_token_accuracy": 0.423467755317688, + "num_tokens": 7195688968.0, + "step": 14076 + }, + { + "epoch": 3.8066522444564628, + "grad_norm": 3.1875, + "learning_rate": 0.0045587623382111475, + "loss": 2.9536, + "mean_token_accuracy": 0.42242369055747986, + "num_tokens": 7196213177.0, + "step": 14077 + }, + { + "epoch": 3.806922660897783, + "grad_norm": 3.84375, + "learning_rate": 0.004557661393426101, + "loss": 3.1529, + "mean_token_accuracy": 0.4183916449546814, + "num_tokens": 7196689578.0, + "step": 14078 + }, + { + "epoch": 3.807193077339102, + "grad_norm": 4.75, + "learning_rate": 0.004556560646310946, + "loss": 2.8532, + "mean_token_accuracy": 0.4505447745323181, + "num_tokens": 7197169397.0, + "step": 14079 + }, + { + "epoch": 3.8074634937804217, + "grad_norm": 2.1875, + "learning_rate": 0.0045554600968994614, + "loss": 3.0532, + "mean_token_accuracy": 0.40624672174453735, + "num_tokens": 7197693682.0, + "step": 14080 + }, + { + "epoch": 3.8077339102217413, + "grad_norm": 3.65625, + "learning_rate": 0.004554359745225412, + "loss": 2.8253, + "mean_token_accuracy": 0.44433528184890747, + "num_tokens": 7198217901.0, + "step": 14081 + }, + { + "epoch": 3.808004326663061, + "grad_norm": 3.296875, + "learning_rate": 0.004553259591322562, + "loss": 2.6951, + "mean_token_accuracy": 0.43074163794517517, + "num_tokens": 7198722476.0, + "step": 14082 + }, + { + "epoch": 3.8082747431043806, + "grad_norm": 2.484375, + "learning_rate": 0.004552159635224665, + "loss": 2.844, + "mean_token_accuracy": 0.4129851758480072, + "num_tokens": 7199246745.0, + "step": 14083 + }, + { + "epoch": 3.8085451595457003, + "grad_norm": 3.078125, + "learning_rate": 0.004551059876965474, + "loss": 3.1905, + "mean_token_accuracy": 0.37734344601631165, + "num_tokens": 7199771007.0, + "step": 14084 + }, + { + "epoch": 3.80881557598702, + "grad_norm": 3.359375, + "learning_rate": 0.004549960316578728, + "loss": 2.9096, + "mean_token_accuracy": 0.4196716845035553, + "num_tokens": 7200295256.0, + "step": 14085 + }, + { + "epoch": 3.8090859924283396, + "grad_norm": 3.75, + "learning_rate": 0.004548860954098171, + "loss": 3.0123, + "mean_token_accuracy": 0.40411144495010376, + "num_tokens": 7200794647.0, + "step": 14086 + }, + { + "epoch": 3.809356408869659, + "grad_norm": 2.453125, + "learning_rate": 0.004547761789557533, + "loss": 2.7521, + "mean_token_accuracy": 0.4607153534889221, + "num_tokens": 7201236554.0, + "step": 14087 + }, + { + "epoch": 3.809626825310979, + "grad_norm": 2.734375, + "learning_rate": 0.004546662822990534, + "loss": 2.7992, + "mean_token_accuracy": 0.44708946347236633, + "num_tokens": 7201760796.0, + "step": 14088 + }, + { + "epoch": 3.8098972417522985, + "grad_norm": 3.765625, + "learning_rate": 0.0045455640544309, + "loss": 2.715, + "mean_token_accuracy": 0.4516700804233551, + "num_tokens": 7202178150.0, + "step": 14089 + }, + { + "epoch": 3.810167658193618, + "grad_norm": 3.25, + "learning_rate": 0.004544465483912339, + "loss": 2.8588, + "mean_token_accuracy": 0.44481539726257324, + "num_tokens": 7202702411.0, + "step": 14090 + }, + { + "epoch": 3.8104380746349378, + "grad_norm": 4.71875, + "learning_rate": 0.004543367111468566, + "loss": 10.0346, + "mean_token_accuracy": 0.011611707508563995, + "num_tokens": 7203226622.0, + "step": 14091 + }, + { + "epoch": 3.8107084910762574, + "grad_norm": 7.3125, + "learning_rate": 0.004542268937133276, + "loss": 3.0786, + "mean_token_accuracy": 0.3973131775856018, + "num_tokens": 7203750788.0, + "step": 14092 + }, + { + "epoch": 3.810978907517577, + "grad_norm": 2.546875, + "learning_rate": 0.004541170960940164, + "loss": 2.9677, + "mean_token_accuracy": 0.4256463050842285, + "num_tokens": 7204236812.0, + "step": 14093 + }, + { + "epoch": 3.8112493239588967, + "grad_norm": 3.0, + "learning_rate": 0.004540073182922925, + "loss": 2.9752, + "mean_token_accuracy": 0.4200776219367981, + "num_tokens": 7204760911.0, + "step": 14094 + }, + { + "epoch": 3.8115197404002163, + "grad_norm": 3.0625, + "learning_rate": 0.0045389756031152355, + "loss": 2.8264, + "mean_token_accuracy": 0.4424833655357361, + "num_tokens": 7205285046.0, + "step": 14095 + }, + { + "epoch": 3.811790156841536, + "grad_norm": 3.375, + "learning_rate": 0.004537878221550774, + "loss": 2.9139, + "mean_token_accuracy": 0.4398494064807892, + "num_tokens": 7205795588.0, + "step": 14096 + }, + { + "epoch": 3.8120605732828556, + "grad_norm": 3.125, + "learning_rate": 0.004536781038263212, + "loss": 2.707, + "mean_token_accuracy": 0.45498865842819214, + "num_tokens": 7206305517.0, + "step": 14097 + }, + { + "epoch": 3.8123309897241753, + "grad_norm": 3.296875, + "learning_rate": 0.004535684053286218, + "loss": 2.8216, + "mean_token_accuracy": 0.4229837656021118, + "num_tokens": 7206829715.0, + "step": 14098 + }, + { + "epoch": 3.812601406165495, + "grad_norm": 2.640625, + "learning_rate": 0.004534587266653445, + "loss": 2.8585, + "mean_token_accuracy": 0.4320710003376007, + "num_tokens": 7207353975.0, + "step": 14099 + }, + { + "epoch": 3.8128718226068146, + "grad_norm": 3.859375, + "learning_rate": 0.00453349067839855, + "loss": 2.7602, + "mean_token_accuracy": 0.466091126203537, + "num_tokens": 7207878248.0, + "step": 14100 + }, + { + "epoch": 3.813142239048134, + "grad_norm": 2.46875, + "learning_rate": 0.004532394288555179, + "loss": 2.9194, + "mean_token_accuracy": 0.4519827663898468, + "num_tokens": 7208340584.0, + "step": 14101 + }, + { + "epoch": 3.813412655489454, + "grad_norm": 3.28125, + "learning_rate": 0.004531298097156967, + "loss": 2.9886, + "mean_token_accuracy": 0.4298480153083801, + "num_tokens": 7208864805.0, + "step": 14102 + }, + { + "epoch": 3.8136830719307735, + "grad_norm": 2.90625, + "learning_rate": 0.0045302021042375575, + "loss": 3.0137, + "mean_token_accuracy": 0.43777167797088623, + "num_tokens": 7209326674.0, + "step": 14103 + }, + { + "epoch": 3.813953488372093, + "grad_norm": 3.1875, + "learning_rate": 0.004529106309830572, + "loss": 2.5413, + "mean_token_accuracy": 0.4661028981208801, + "num_tokens": 7209791289.0, + "step": 14104 + }, + { + "epoch": 3.814223904813413, + "grad_norm": 2.625, + "learning_rate": 0.004528010713969632, + "loss": 2.8487, + "mean_token_accuracy": 0.4411245286464691, + "num_tokens": 7210260038.0, + "step": 14105 + }, + { + "epoch": 3.8144943212547324, + "grad_norm": 2.828125, + "learning_rate": 0.004526915316688361, + "loss": 2.8169, + "mean_token_accuracy": 0.42542189359664917, + "num_tokens": 7210784303.0, + "step": 14106 + }, + { + "epoch": 3.814764737696052, + "grad_norm": 3.15625, + "learning_rate": 0.0045258201180203616, + "loss": 2.8522, + "mean_token_accuracy": 0.42496803402900696, + "num_tokens": 7211308564.0, + "step": 14107 + }, + { + "epoch": 3.8150351541373717, + "grad_norm": 2.609375, + "learning_rate": 0.004524725117999239, + "loss": 2.903, + "mean_token_accuracy": 0.43819940090179443, + "num_tokens": 7211832833.0, + "step": 14108 + }, + { + "epoch": 3.8153055705786914, + "grad_norm": 3.765625, + "learning_rate": 0.004523630316658593, + "loss": 2.9598, + "mean_token_accuracy": 0.43648362159729004, + "num_tokens": 7212299898.0, + "step": 14109 + }, + { + "epoch": 3.8155759870200106, + "grad_norm": 3.0625, + "learning_rate": 0.004522535714032014, + "loss": 2.8598, + "mean_token_accuracy": 0.4434208273887634, + "num_tokens": 7212805509.0, + "step": 14110 + }, + { + "epoch": 3.8158464034613306, + "grad_norm": 3.40625, + "learning_rate": 0.004521441310153088, + "loss": 9.7641, + "mean_token_accuracy": 0.02426719292998314, + "num_tokens": 7213329700.0, + "step": 14111 + }, + { + "epoch": 3.81611681990265, + "grad_norm": 8.5, + "learning_rate": 0.004520347105055398, + "loss": 3.0537, + "mean_token_accuracy": 0.39978379011154175, + "num_tokens": 7213853863.0, + "step": 14112 + }, + { + "epoch": 3.81638723634397, + "grad_norm": 2.6875, + "learning_rate": 0.00451925309877251, + "loss": 2.7846, + "mean_token_accuracy": 0.43939733505249023, + "num_tokens": 7214378000.0, + "step": 14113 + }, + { + "epoch": 3.816657652785289, + "grad_norm": 4.5, + "learning_rate": 0.004518159291338, + "loss": 3.026, + "mean_token_accuracy": 0.47762057185173035, + "num_tokens": 7214752216.0, + "step": 14114 + }, + { + "epoch": 3.816928069226609, + "grad_norm": 3.4375, + "learning_rate": 0.0045170656827854235, + "loss": 2.8011, + "mean_token_accuracy": 0.4499237537384033, + "num_tokens": 7215276492.0, + "step": 14115 + }, + { + "epoch": 3.8171984856679284, + "grad_norm": 4.46875, + "learning_rate": 0.004515972273148334, + "loss": 2.8797, + "mean_token_accuracy": 0.4242863357067108, + "num_tokens": 7215769421.0, + "step": 14116 + }, + { + "epoch": 3.8174689021092485, + "grad_norm": 2.859375, + "learning_rate": 0.004514879062460287, + "loss": 2.7261, + "mean_token_accuracy": 0.41485852003097534, + "num_tokens": 7216293681.0, + "step": 14117 + }, + { + "epoch": 3.8177393185505677, + "grad_norm": 3.328125, + "learning_rate": 0.004513786050754822, + "loss": 2.7715, + "mean_token_accuracy": 0.4372621774673462, + "num_tokens": 7216798081.0, + "step": 14118 + }, + { + "epoch": 3.818009734991888, + "grad_norm": 2.734375, + "learning_rate": 0.004512693238065473, + "loss": 2.8864, + "mean_token_accuracy": 0.42541414499282837, + "num_tokens": 7217322240.0, + "step": 14119 + }, + { + "epoch": 3.818280151433207, + "grad_norm": 3.1875, + "learning_rate": 0.004511600624425774, + "loss": 2.771, + "mean_token_accuracy": 0.44188639521598816, + "num_tokens": 7217845552.0, + "step": 14120 + }, + { + "epoch": 3.8185505678745266, + "grad_norm": 2.953125, + "learning_rate": 0.004510508209869252, + "loss": 2.8074, + "mean_token_accuracy": 0.4416714310646057, + "num_tokens": 7218369749.0, + "step": 14121 + }, + { + "epoch": 3.8188209843158463, + "grad_norm": 2.859375, + "learning_rate": 0.004509415994429417, + "loss": 2.6489, + "mean_token_accuracy": 0.44927194714546204, + "num_tokens": 7218893926.0, + "step": 14122 + }, + { + "epoch": 3.819091400757166, + "grad_norm": 2.6875, + "learning_rate": 0.004508323978139792, + "loss": 2.9715, + "mean_token_accuracy": 0.41245532035827637, + "num_tokens": 7219418203.0, + "step": 14123 + }, + { + "epoch": 3.8193618171984856, + "grad_norm": 3.109375, + "learning_rate": 0.004507232161033874, + "loss": 2.783, + "mean_token_accuracy": 0.4490107297897339, + "num_tokens": 7219897392.0, + "step": 14124 + }, + { + "epoch": 3.819632233639805, + "grad_norm": 2.9375, + "learning_rate": 0.004506140543145171, + "loss": 2.8324, + "mean_token_accuracy": 0.404577374458313, + "num_tokens": 7220421467.0, + "step": 14125 + }, + { + "epoch": 3.819902650081125, + "grad_norm": 3.734375, + "learning_rate": 0.00450504912450717, + "loss": 2.7588, + "mean_token_accuracy": 0.41390228271484375, + "num_tokens": 7220945722.0, + "step": 14126 + }, + { + "epoch": 3.8201730665224445, + "grad_norm": 4.25, + "learning_rate": 0.004503957905153364, + "loss": 2.8705, + "mean_token_accuracy": 0.42779192328453064, + "num_tokens": 7221469918.0, + "step": 14127 + }, + { + "epoch": 3.820443482963764, + "grad_norm": 2.46875, + "learning_rate": 0.004502866885117232, + "loss": 2.8632, + "mean_token_accuracy": 0.4447892904281616, + "num_tokens": 7221931160.0, + "step": 14128 + }, + { + "epoch": 3.8207138994050838, + "grad_norm": 3.125, + "learning_rate": 0.004501776064432253, + "loss": 2.6612, + "mean_token_accuracy": 0.45218610763549805, + "num_tokens": 7222453131.0, + "step": 14129 + }, + { + "epoch": 3.8209843158464034, + "grad_norm": 2.4375, + "learning_rate": 0.004500685443131894, + "loss": 2.8334, + "mean_token_accuracy": 0.4547540545463562, + "num_tokens": 7222874086.0, + "step": 14130 + }, + { + "epoch": 3.821254732287723, + "grad_norm": 9.375, + "learning_rate": 0.004499595021249616, + "loss": 9.4454, + "mean_token_accuracy": 0.040121130645275116, + "num_tokens": 7223332896.0, + "step": 14131 + }, + { + "epoch": 3.8215251487290427, + "grad_norm": 7.15625, + "learning_rate": 0.004498504798818882, + "loss": 3.0602, + "mean_token_accuracy": 0.40201255679130554, + "num_tokens": 7223857096.0, + "step": 14132 + }, + { + "epoch": 3.8217955651703623, + "grad_norm": 2.671875, + "learning_rate": 0.004497414775873141, + "loss": 2.8375, + "mean_token_accuracy": 0.4407300353050232, + "num_tokens": 7224381337.0, + "step": 14133 + }, + { + "epoch": 3.822065981611682, + "grad_norm": 3.296875, + "learning_rate": 0.004496324952445835, + "loss": 2.7202, + "mean_token_accuracy": 0.45034289360046387, + "num_tokens": 7224905601.0, + "step": 14134 + }, + { + "epoch": 3.8223363980530016, + "grad_norm": 2.703125, + "learning_rate": 0.004495235328570407, + "loss": 3.0238, + "mean_token_accuracy": 0.425325870513916, + "num_tokens": 7225401611.0, + "step": 14135 + }, + { + "epoch": 3.8226068144943213, + "grad_norm": 3.90625, + "learning_rate": 0.004494145904280287, + "loss": 2.7312, + "mean_token_accuracy": 0.44367626309394836, + "num_tokens": 7225898046.0, + "step": 14136 + }, + { + "epoch": 3.822877230935641, + "grad_norm": 2.359375, + "learning_rate": 0.004493056679608904, + "loss": 2.7719, + "mean_token_accuracy": 0.437641978263855, + "num_tokens": 7226422326.0, + "step": 14137 + }, + { + "epoch": 3.8231476473769606, + "grad_norm": 3.3125, + "learning_rate": 0.004491967654589677, + "loss": 2.8582, + "mean_token_accuracy": 0.43462586402893066, + "num_tokens": 7226890391.0, + "step": 14138 + }, + { + "epoch": 3.82341806381828, + "grad_norm": 2.84375, + "learning_rate": 0.0044908788292560235, + "loss": 2.9028, + "mean_token_accuracy": 0.4359178841114044, + "num_tokens": 7227414608.0, + "step": 14139 + }, + { + "epoch": 3.8236884802596, + "grad_norm": 3.421875, + "learning_rate": 0.004489790203641346, + "loss": 2.8442, + "mean_token_accuracy": 0.40811219811439514, + "num_tokens": 7227938621.0, + "step": 14140 + }, + { + "epoch": 3.8239588967009195, + "grad_norm": 3.59375, + "learning_rate": 0.004488701777779054, + "loss": 2.8692, + "mean_token_accuracy": 0.43004855513572693, + "num_tokens": 7228462777.0, + "step": 14141 + }, + { + "epoch": 3.824229313142239, + "grad_norm": 3.5625, + "learning_rate": 0.0044876135517025375, + "loss": 2.8932, + "mean_token_accuracy": 0.43153417110443115, + "num_tokens": 7228929268.0, + "step": 14142 + }, + { + "epoch": 3.8244997295835588, + "grad_norm": 3.578125, + "learning_rate": 0.004486525525445191, + "loss": 2.7545, + "mean_token_accuracy": 0.44204986095428467, + "num_tokens": 7229453480.0, + "step": 14143 + }, + { + "epoch": 3.8247701460248784, + "grad_norm": 3.171875, + "learning_rate": 0.004485437699040395, + "loss": 2.7964, + "mean_token_accuracy": 0.4322562515735626, + "num_tokens": 7229934809.0, + "step": 14144 + }, + { + "epoch": 3.825040562466198, + "grad_norm": 3.09375, + "learning_rate": 0.004484350072521528, + "loss": 2.9352, + "mean_token_accuracy": 0.4157593846321106, + "num_tokens": 7230459060.0, + "step": 14145 + }, + { + "epoch": 3.8253109789075177, + "grad_norm": 3.375, + "learning_rate": 0.004483262645921963, + "loss": 2.8403, + "mean_token_accuracy": 0.42867231369018555, + "num_tokens": 7230983131.0, + "step": 14146 + }, + { + "epoch": 3.8255813953488373, + "grad_norm": 2.921875, + "learning_rate": 0.004482175419275065, + "loss": 2.9163, + "mean_token_accuracy": 0.44718924164772034, + "num_tokens": 7231398714.0, + "step": 14147 + }, + { + "epoch": 3.825851811790157, + "grad_norm": 3.203125, + "learning_rate": 0.004481088392614191, + "loss": 2.998, + "mean_token_accuracy": 0.422370046377182, + "num_tokens": 7231922885.0, + "step": 14148 + }, + { + "epoch": 3.8261222282314766, + "grad_norm": 3.015625, + "learning_rate": 0.0044800015659726975, + "loss": 2.7205, + "mean_token_accuracy": 0.43370649218559265, + "num_tokens": 7232447045.0, + "step": 14149 + }, + { + "epoch": 3.8263926446727963, + "grad_norm": 2.578125, + "learning_rate": 0.004478914939383931, + "loss": 2.8962, + "mean_token_accuracy": 0.43874576687812805, + "num_tokens": 7232971109.0, + "step": 14150 + }, + { + "epoch": 3.8266630611141155, + "grad_norm": 17.625, + "learning_rate": 0.004477828512881229, + "loss": 11.2125, + "mean_token_accuracy": 0.0013132415479049087, + "num_tokens": 7233495293.0, + "step": 14151 + }, + { + "epoch": 3.8269334775554356, + "grad_norm": 9.75, + "learning_rate": 0.00447674228649793, + "loss": 3.0549, + "mean_token_accuracy": 0.4288436472415924, + "num_tokens": 7234019449.0, + "step": 14152 + }, + { + "epoch": 3.8272038939967548, + "grad_norm": 3.515625, + "learning_rate": 0.004475656260267359, + "loss": 3.0004, + "mean_token_accuracy": 0.39524000883102417, + "num_tokens": 7234543604.0, + "step": 14153 + }, + { + "epoch": 3.827474310438075, + "grad_norm": 3.484375, + "learning_rate": 0.0044745704342228415, + "loss": 2.7512, + "mean_token_accuracy": 0.4201245903968811, + "num_tokens": 7235067833.0, + "step": 14154 + }, + { + "epoch": 3.827744726879394, + "grad_norm": 3.21875, + "learning_rate": 0.004473484808397695, + "loss": 3.0328, + "mean_token_accuracy": 0.4303114116191864, + "num_tokens": 7235592100.0, + "step": 14155 + }, + { + "epoch": 3.828015143320714, + "grad_norm": 2.9375, + "learning_rate": 0.004472399382825228, + "loss": 2.9916, + "mean_token_accuracy": 0.4715162217617035, + "num_tokens": 7236050829.0, + "step": 14156 + }, + { + "epoch": 3.8282855597620333, + "grad_norm": 3.921875, + "learning_rate": 0.004471314157538743, + "loss": 2.7795, + "mean_token_accuracy": 0.43524420261383057, + "num_tokens": 7236574849.0, + "step": 14157 + }, + { + "epoch": 3.8285559762033534, + "grad_norm": 2.1875, + "learning_rate": 0.004470229132571541, + "loss": 2.819, + "mean_token_accuracy": 0.4485674500465393, + "num_tokens": 7237099112.0, + "step": 14158 + }, + { + "epoch": 3.8288263926446726, + "grad_norm": 3.390625, + "learning_rate": 0.004469144307956909, + "loss": 2.7095, + "mean_token_accuracy": 0.44314152002334595, + "num_tokens": 7237623241.0, + "step": 14159 + }, + { + "epoch": 3.8290968090859927, + "grad_norm": 2.625, + "learning_rate": 0.004468059683728138, + "loss": 3.0229, + "mean_token_accuracy": 0.4151931703090668, + "num_tokens": 7238147505.0, + "step": 14160 + }, + { + "epoch": 3.829367225527312, + "grad_norm": 3.375, + "learning_rate": 0.004466975259918506, + "loss": 2.6399, + "mean_token_accuracy": 0.44315722584724426, + "num_tokens": 7238671699.0, + "step": 14161 + }, + { + "epoch": 3.8296376419686315, + "grad_norm": 2.578125, + "learning_rate": 0.004465891036561281, + "loss": 2.9224, + "mean_token_accuracy": 0.4345770478248596, + "num_tokens": 7239195888.0, + "step": 14162 + }, + { + "epoch": 3.829908058409951, + "grad_norm": 3.515625, + "learning_rate": 0.00446480701368974, + "loss": 2.8885, + "mean_token_accuracy": 0.4273167550563812, + "num_tokens": 7239720146.0, + "step": 14163 + }, + { + "epoch": 3.830178474851271, + "grad_norm": 2.734375, + "learning_rate": 0.0044637231913371355, + "loss": 2.8541, + "mean_token_accuracy": 0.4221431016921997, + "num_tokens": 7240244172.0, + "step": 14164 + }, + { + "epoch": 3.8304488912925905, + "grad_norm": 3.203125, + "learning_rate": 0.004462639569536725, + "loss": 2.8205, + "mean_token_accuracy": 0.42650124430656433, + "num_tokens": 7240768256.0, + "step": 14165 + }, + { + "epoch": 3.83071930773391, + "grad_norm": 3.3125, + "learning_rate": 0.0044615561483217605, + "loss": 2.9689, + "mean_token_accuracy": 0.4276497960090637, + "num_tokens": 7241242968.0, + "step": 14166 + }, + { + "epoch": 3.8309897241752298, + "grad_norm": 3.34375, + "learning_rate": 0.004460472927725477, + "loss": 2.9333, + "mean_token_accuracy": 0.4280744791030884, + "num_tokens": 7241767235.0, + "step": 14167 + }, + { + "epoch": 3.8312601406165494, + "grad_norm": 2.84375, + "learning_rate": 0.00445938990778112, + "loss": 2.7751, + "mean_token_accuracy": 0.4590241014957428, + "num_tokens": 7242291517.0, + "step": 14168 + }, + { + "epoch": 3.831530557057869, + "grad_norm": 3.421875, + "learning_rate": 0.004458307088521911, + "loss": 2.8031, + "mean_token_accuracy": 0.4263617694377899, + "num_tokens": 7242815673.0, + "step": 14169 + }, + { + "epoch": 3.8318009734991887, + "grad_norm": 2.71875, + "learning_rate": 0.004457224469981084, + "loss": 2.7394, + "mean_token_accuracy": 0.4400368332862854, + "num_tokens": 7243339788.0, + "step": 14170 + }, + { + "epoch": 3.8320713899405083, + "grad_norm": 6.40625, + "learning_rate": 0.004456142052191845, + "loss": 9.8111, + "mean_token_accuracy": 0.015864785760641098, + "num_tokens": 7243820358.0, + "step": 14171 + }, + { + "epoch": 3.832341806381828, + "grad_norm": 8.625, + "learning_rate": 0.004455059835187417, + "loss": 3.0759, + "mean_token_accuracy": 0.4209340214729309, + "num_tokens": 7244341370.0, + "step": 14172 + }, + { + "epoch": 3.8326122228231476, + "grad_norm": 2.5625, + "learning_rate": 0.004453977819001, + "loss": 2.8819, + "mean_token_accuracy": 0.4392009973526001, + "num_tokens": 7244865519.0, + "step": 14173 + }, + { + "epoch": 3.8328826392644673, + "grad_norm": 2.578125, + "learning_rate": 0.004452896003665792, + "loss": 2.8663, + "mean_token_accuracy": 0.4733273983001709, + "num_tokens": 7245326669.0, + "step": 14174 + }, + { + "epoch": 3.833153055705787, + "grad_norm": 3.640625, + "learning_rate": 0.00445181438921499, + "loss": 2.9364, + "mean_token_accuracy": 0.43821287155151367, + "num_tokens": 7245832985.0, + "step": 14175 + }, + { + "epoch": 3.8334234721471065, + "grad_norm": 3.390625, + "learning_rate": 0.004450732975681781, + "loss": 2.8412, + "mean_token_accuracy": 0.432841420173645, + "num_tokens": 7246357073.0, + "step": 14176 + }, + { + "epoch": 3.833693888588426, + "grad_norm": 2.8125, + "learning_rate": 0.0044496517630993404, + "loss": 2.7654, + "mean_token_accuracy": 0.4446010887622833, + "num_tokens": 7246881212.0, + "step": 14177 + }, + { + "epoch": 3.833964305029746, + "grad_norm": 3.21875, + "learning_rate": 0.00444857075150085, + "loss": 2.7807, + "mean_token_accuracy": 0.44156256318092346, + "num_tokens": 7247405352.0, + "step": 14178 + }, + { + "epoch": 3.8342347214710655, + "grad_norm": 2.9375, + "learning_rate": 0.004447489940919476, + "loss": 2.7309, + "mean_token_accuracy": 0.4570443034172058, + "num_tokens": 7247929601.0, + "step": 14179 + }, + { + "epoch": 3.834505137912385, + "grad_norm": 3.59375, + "learning_rate": 0.004446409331388377, + "loss": 2.9699, + "mean_token_accuracy": 0.4263010621070862, + "num_tokens": 7248433135.0, + "step": 14180 + }, + { + "epoch": 3.8347755543537048, + "grad_norm": 3.21875, + "learning_rate": 0.0044453289229407136, + "loss": 2.8607, + "mean_token_accuracy": 0.45263904333114624, + "num_tokens": 7248902957.0, + "step": 14181 + }, + { + "epoch": 3.8350459707950244, + "grad_norm": 3.46875, + "learning_rate": 0.004444248715609635, + "loss": 2.8571, + "mean_token_accuracy": 0.4394735097885132, + "num_tokens": 7249398213.0, + "step": 14182 + }, + { + "epoch": 3.835316387236344, + "grad_norm": 2.84375, + "learning_rate": 0.004443168709428285, + "loss": 2.9042, + "mean_token_accuracy": 0.43185049295425415, + "num_tokens": 7249922335.0, + "step": 14183 + }, + { + "epoch": 3.8355868036776637, + "grad_norm": 3.234375, + "learning_rate": 0.004442088904429803, + "loss": 2.6875, + "mean_token_accuracy": 0.4375656545162201, + "num_tokens": 7250446565.0, + "step": 14184 + }, + { + "epoch": 3.8358572201189833, + "grad_norm": 2.515625, + "learning_rate": 0.004441009300647317, + "loss": 2.9296, + "mean_token_accuracy": 0.4381589889526367, + "num_tokens": 7250940152.0, + "step": 14185 + }, + { + "epoch": 3.836127636560303, + "grad_norm": 3.234375, + "learning_rate": 0.004439929898113956, + "loss": 2.8356, + "mean_token_accuracy": 0.43760621547698975, + "num_tokens": 7251464412.0, + "step": 14186 + }, + { + "epoch": 3.8363980530016226, + "grad_norm": 3.75, + "learning_rate": 0.0044388506968628385, + "loss": 2.9659, + "mean_token_accuracy": 0.42911049723625183, + "num_tokens": 7251988544.0, + "step": 14187 + }, + { + "epoch": 3.8366684694429423, + "grad_norm": 3.796875, + "learning_rate": 0.004437771696927073, + "loss": 3.0241, + "mean_token_accuracy": 0.42441055178642273, + "num_tokens": 7252506504.0, + "step": 14188 + }, + { + "epoch": 3.836938885884262, + "grad_norm": 3.625, + "learning_rate": 0.004436692898339775, + "loss": 2.7303, + "mean_token_accuracy": 0.46257829666137695, + "num_tokens": 7253030657.0, + "step": 14189 + }, + { + "epoch": 3.8372093023255816, + "grad_norm": 2.96875, + "learning_rate": 0.004435614301134038, + "loss": 2.9536, + "mean_token_accuracy": 0.4308702349662781, + "num_tokens": 7253554931.0, + "step": 14190 + }, + { + "epoch": 3.837479718766901, + "grad_norm": 4.875, + "learning_rate": 0.004434535905342957, + "loss": 9.7905, + "mean_token_accuracy": 0.008618950843811035, + "num_tokens": 7254079138.0, + "step": 14191 + }, + { + "epoch": 3.8377501352082204, + "grad_norm": 7.15625, + "learning_rate": 0.004433457710999625, + "loss": 2.9197, + "mean_token_accuracy": 0.4264047145843506, + "num_tokens": 7254552043.0, + "step": 14192 + }, + { + "epoch": 3.8380205516495405, + "grad_norm": 2.875, + "learning_rate": 0.00443237971813712, + "loss": 3.0581, + "mean_token_accuracy": 0.421114444732666, + "num_tokens": 7255076255.0, + "step": 14193 + }, + { + "epoch": 3.8382909680908597, + "grad_norm": 3.71875, + "learning_rate": 0.004431301926788519, + "loss": 2.8452, + "mean_token_accuracy": 0.4338093101978302, + "num_tokens": 7255600516.0, + "step": 14194 + }, + { + "epoch": 3.8385613845321798, + "grad_norm": 3.671875, + "learning_rate": 0.004430224336986893, + "loss": 2.8579, + "mean_token_accuracy": 0.4341527223587036, + "num_tokens": 7256115997.0, + "step": 14195 + }, + { + "epoch": 3.838831800973499, + "grad_norm": 4.21875, + "learning_rate": 0.004429146948765303, + "loss": 2.776, + "mean_token_accuracy": 0.4540776312351227, + "num_tokens": 7256608715.0, + "step": 14196 + }, + { + "epoch": 3.839102217414819, + "grad_norm": 2.9375, + "learning_rate": 0.004428069762156809, + "loss": 2.7038, + "mean_token_accuracy": 0.4737546443939209, + "num_tokens": 7257058775.0, + "step": 14197 + }, + { + "epoch": 3.8393726338561383, + "grad_norm": 2.859375, + "learning_rate": 0.004426992777194462, + "loss": 2.8874, + "mean_token_accuracy": 0.4499240219593048, + "num_tokens": 7257523066.0, + "step": 14198 + }, + { + "epoch": 3.8396430502974583, + "grad_norm": 3.578125, + "learning_rate": 0.004425915993911308, + "loss": 2.7919, + "mean_token_accuracy": 0.3870384097099304, + "num_tokens": 7258047310.0, + "step": 14199 + }, + { + "epoch": 3.8399134667387775, + "grad_norm": 1.9140625, + "learning_rate": 0.004424839412340381, + "loss": 2.8738, + "mean_token_accuracy": 0.44268858432769775, + "num_tokens": 7258571487.0, + "step": 14200 + }, + { + "epoch": 3.8401838831800976, + "grad_norm": 4.21875, + "learning_rate": 0.004423763032514721, + "loss": 2.8871, + "mean_token_accuracy": 0.4348434805870056, + "num_tokens": 7259092333.0, + "step": 14201 + }, + { + "epoch": 3.840454299621417, + "grad_norm": 3.171875, + "learning_rate": 0.0044226868544673505, + "loss": 2.9856, + "mean_token_accuracy": 0.43536460399627686, + "num_tokens": 7259566798.0, + "step": 14202 + }, + { + "epoch": 3.8407247160627365, + "grad_norm": 4.0625, + "learning_rate": 0.004421610878231286, + "loss": 2.8483, + "mean_token_accuracy": 0.4282814860343933, + "num_tokens": 7260091002.0, + "step": 14203 + }, + { + "epoch": 3.840995132504056, + "grad_norm": 2.03125, + "learning_rate": 0.0044205351038395495, + "loss": 2.6968, + "mean_token_accuracy": 0.44401171803474426, + "num_tokens": 7260615250.0, + "step": 14204 + }, + { + "epoch": 3.8412655489453758, + "grad_norm": 3.015625, + "learning_rate": 0.004419459531325142, + "loss": 2.7542, + "mean_token_accuracy": 0.44956550002098083, + "num_tokens": 7261139519.0, + "step": 14205 + }, + { + "epoch": 3.8415359653866954, + "grad_norm": 3.296875, + "learning_rate": 0.004418384160721069, + "loss": 2.837, + "mean_token_accuracy": 0.4424372911453247, + "num_tokens": 7261663580.0, + "step": 14206 + }, + { + "epoch": 3.841806381828015, + "grad_norm": 3.390625, + "learning_rate": 0.004417308992060327, + "loss": 2.9177, + "mean_token_accuracy": 0.4226487874984741, + "num_tokens": 7262187773.0, + "step": 14207 + }, + { + "epoch": 3.8420767982693347, + "grad_norm": 2.75, + "learning_rate": 0.0044162340253759, + "loss": 2.6612, + "mean_token_accuracy": 0.44142067432403564, + "num_tokens": 7262711905.0, + "step": 14208 + }, + { + "epoch": 3.8423472147106543, + "grad_norm": 3.640625, + "learning_rate": 0.00441515926070078, + "loss": 2.8712, + "mean_token_accuracy": 0.4105682373046875, + "num_tokens": 7263236106.0, + "step": 14209 + }, + { + "epoch": 3.842617631151974, + "grad_norm": 3.1875, + "learning_rate": 0.004414084698067933, + "loss": 2.9666, + "mean_token_accuracy": 0.4336230158805847, + "num_tokens": 7263760351.0, + "step": 14210 + }, + { + "epoch": 3.8428880475932936, + "grad_norm": 15.1875, + "learning_rate": 0.004413010337510337, + "loss": 9.5819, + "mean_token_accuracy": 0.0333746001124382, + "num_tokens": 7264244100.0, + "step": 14211 + }, + { + "epoch": 3.8431584640346133, + "grad_norm": 8.1875, + "learning_rate": 0.004411936179060957, + "loss": 3.1171, + "mean_token_accuracy": 0.40209871530532837, + "num_tokens": 7264768378.0, + "step": 14212 + }, + { + "epoch": 3.843428880475933, + "grad_norm": 2.328125, + "learning_rate": 0.004410862222752748, + "loss": 2.8519, + "mean_token_accuracy": 0.42557334899902344, + "num_tokens": 7265274176.0, + "step": 14213 + }, + { + "epoch": 3.8436992969172525, + "grad_norm": 2.390625, + "learning_rate": 0.004409788468618662, + "loss": 2.8144, + "mean_token_accuracy": 0.43682438135147095, + "num_tokens": 7265792090.0, + "step": 14214 + }, + { + "epoch": 3.843969713358572, + "grad_norm": 3.1875, + "learning_rate": 0.00440871491669165, + "loss": 2.8244, + "mean_token_accuracy": 0.4436747133731842, + "num_tokens": 7266296930.0, + "step": 14215 + }, + { + "epoch": 3.844240129799892, + "grad_norm": 3.359375, + "learning_rate": 0.004407641567004647, + "loss": 2.7752, + "mean_token_accuracy": 0.44421836733818054, + "num_tokens": 7266821039.0, + "step": 14216 + }, + { + "epoch": 3.8445105462412115, + "grad_norm": 2.609375, + "learning_rate": 0.004406568419590584, + "loss": 2.8123, + "mean_token_accuracy": 0.42421162128448486, + "num_tokens": 7267345326.0, + "step": 14217 + }, + { + "epoch": 3.844780962682531, + "grad_norm": 2.8125, + "learning_rate": 0.004405495474482396, + "loss": 3.0335, + "mean_token_accuracy": 0.41634637117385864, + "num_tokens": 7267869492.0, + "step": 14218 + }, + { + "epoch": 3.8450513791238508, + "grad_norm": 3.640625, + "learning_rate": 0.004404422731712999, + "loss": 2.8647, + "mean_token_accuracy": 0.4329427480697632, + "num_tokens": 7268387746.0, + "step": 14219 + }, + { + "epoch": 3.8453217955651704, + "grad_norm": 2.9375, + "learning_rate": 0.004403350191315308, + "loss": 3.0378, + "mean_token_accuracy": 0.41942089796066284, + "num_tokens": 7268907566.0, + "step": 14220 + }, + { + "epoch": 3.84559221200649, + "grad_norm": 3.453125, + "learning_rate": 0.004402277853322233, + "loss": 2.9639, + "mean_token_accuracy": 0.41334423422813416, + "num_tokens": 7269431825.0, + "step": 14221 + }, + { + "epoch": 3.8458626284478097, + "grad_norm": 2.6875, + "learning_rate": 0.004401205717766678, + "loss": 2.7149, + "mean_token_accuracy": 0.43577590584754944, + "num_tokens": 7269956056.0, + "step": 14222 + }, + { + "epoch": 3.8461330448891293, + "grad_norm": 3.046875, + "learning_rate": 0.004400133784681533, + "loss": 2.8055, + "mean_token_accuracy": 0.43379780650138855, + "num_tokens": 7270480243.0, + "step": 14223 + }, + { + "epoch": 3.846403461330449, + "grad_norm": 3.203125, + "learning_rate": 0.004399062054099694, + "loss": 2.8349, + "mean_token_accuracy": 0.4104585349559784, + "num_tokens": 7270998105.0, + "step": 14224 + }, + { + "epoch": 3.8466738777717686, + "grad_norm": 2.65625, + "learning_rate": 0.004397990526054045, + "loss": 2.6953, + "mean_token_accuracy": 0.4545285105705261, + "num_tokens": 7271522263.0, + "step": 14225 + }, + { + "epoch": 3.8469442942130883, + "grad_norm": 3.234375, + "learning_rate": 0.004396919200577459, + "loss": 2.979, + "mean_token_accuracy": 0.4342137277126312, + "num_tokens": 7272046543.0, + "step": 14226 + }, + { + "epoch": 3.847214710654408, + "grad_norm": 3.59375, + "learning_rate": 0.004395848077702813, + "loss": 2.6992, + "mean_token_accuracy": 0.44503623247146606, + "num_tokens": 7272570812.0, + "step": 14227 + }, + { + "epoch": 3.8474851270957275, + "grad_norm": 3.84375, + "learning_rate": 0.004394777157462968, + "loss": 2.5937, + "mean_token_accuracy": 0.43888258934020996, + "num_tokens": 7273095052.0, + "step": 14228 + }, + { + "epoch": 3.847755543537047, + "grad_norm": 2.4375, + "learning_rate": 0.004393706439890786, + "loss": 2.9473, + "mean_token_accuracy": 0.4282846450805664, + "num_tokens": 7273571440.0, + "step": 14229 + }, + { + "epoch": 3.848025959978367, + "grad_norm": 2.828125, + "learning_rate": 0.004392635925019118, + "loss": 2.8819, + "mean_token_accuracy": 0.43673279881477356, + "num_tokens": 7274095700.0, + "step": 14230 + }, + { + "epoch": 3.8482963764196865, + "grad_norm": 18.0, + "learning_rate": 0.0043915656128808095, + "loss": 10.0013, + "mean_token_accuracy": 0.007588438224047422, + "num_tokens": 7274581674.0, + "step": 14231 + }, + { + "epoch": 3.848566792861006, + "grad_norm": 7.0, + "learning_rate": 0.004390495503508704, + "loss": 2.9713, + "mean_token_accuracy": 0.40053272247314453, + "num_tokens": 7275105844.0, + "step": 14232 + }, + { + "epoch": 3.8488372093023253, + "grad_norm": 2.34375, + "learning_rate": 0.004389425596935634, + "loss": 3.017, + "mean_token_accuracy": 0.4362758994102478, + "num_tokens": 7275580237.0, + "step": 14233 + }, + { + "epoch": 3.8491076257436454, + "grad_norm": 3.734375, + "learning_rate": 0.004388355893194425, + "loss": 2.8302, + "mean_token_accuracy": 0.47807106375694275, + "num_tokens": 7276104497.0, + "step": 14234 + }, + { + "epoch": 3.8493780421849646, + "grad_norm": 5.0, + "learning_rate": 0.004387286392317903, + "loss": 2.6869, + "mean_token_accuracy": 0.49324214458465576, + "num_tokens": 7276585352.0, + "step": 14235 + }, + { + "epoch": 3.8496484586262847, + "grad_norm": 2.75, + "learning_rate": 0.004386217094338881, + "loss": 2.7925, + "mean_token_accuracy": 0.425970196723938, + "num_tokens": 7277109542.0, + "step": 14236 + }, + { + "epoch": 3.849918875067604, + "grad_norm": 3.1875, + "learning_rate": 0.004385147999290165, + "loss": 2.7949, + "mean_token_accuracy": 0.43892788887023926, + "num_tokens": 7277633821.0, + "step": 14237 + }, + { + "epoch": 3.850189291508924, + "grad_norm": 3.078125, + "learning_rate": 0.004384079107204565, + "loss": 2.7062, + "mean_token_accuracy": 0.4328644871711731, + "num_tokens": 7278146397.0, + "step": 14238 + }, + { + "epoch": 3.850459707950243, + "grad_norm": 2.78125, + "learning_rate": 0.004383010418114871, + "loss": 2.6748, + "mean_token_accuracy": 0.4453961253166199, + "num_tokens": 7278670592.0, + "step": 14239 + }, + { + "epoch": 3.8507301243915633, + "grad_norm": 3.328125, + "learning_rate": 0.004381941932053877, + "loss": 2.7954, + "mean_token_accuracy": 0.4251675605773926, + "num_tokens": 7279194788.0, + "step": 14240 + }, + { + "epoch": 3.8510005408328825, + "grad_norm": 3.171875, + "learning_rate": 0.00438087364905437, + "loss": 2.8088, + "mean_token_accuracy": 0.43149715662002563, + "num_tokens": 7279718960.0, + "step": 14241 + }, + { + "epoch": 3.8512709572742025, + "grad_norm": 3.171875, + "learning_rate": 0.004379805569149125, + "loss": 2.9727, + "mean_token_accuracy": 0.429193913936615, + "num_tokens": 7280243132.0, + "step": 14242 + }, + { + "epoch": 3.8515413737155217, + "grad_norm": 3.375, + "learning_rate": 0.00437873769237091, + "loss": 3.1524, + "mean_token_accuracy": 0.3824383616447449, + "num_tokens": 7280767400.0, + "step": 14243 + }, + { + "epoch": 3.8518117901568414, + "grad_norm": 2.6875, + "learning_rate": 0.004377670018752499, + "loss": 2.9273, + "mean_token_accuracy": 0.42828863859176636, + "num_tokens": 7281291616.0, + "step": 14244 + }, + { + "epoch": 3.852082206598161, + "grad_norm": 3.78125, + "learning_rate": 0.004376602548326645, + "loss": 2.9578, + "mean_token_accuracy": 0.42938369512557983, + "num_tokens": 7281796109.0, + "step": 14245 + }, + { + "epoch": 3.8523526230394807, + "grad_norm": 2.625, + "learning_rate": 0.004375535281126101, + "loss": 2.8749, + "mean_token_accuracy": 0.4421338737010956, + "num_tokens": 7282266236.0, + "step": 14246 + }, + { + "epoch": 3.8526230394808003, + "grad_norm": 3.515625, + "learning_rate": 0.0043744682171836195, + "loss": 2.9302, + "mean_token_accuracy": 0.4281005263328552, + "num_tokens": 7282774093.0, + "step": 14247 + }, + { + "epoch": 3.85289345592212, + "grad_norm": 2.59375, + "learning_rate": 0.004373401356531937, + "loss": 2.7918, + "mean_token_accuracy": 0.4375387132167816, + "num_tokens": 7283298347.0, + "step": 14248 + }, + { + "epoch": 3.8531638723634396, + "grad_norm": 3.59375, + "learning_rate": 0.004372334699203785, + "loss": 3.0258, + "mean_token_accuracy": 0.417888879776001, + "num_tokens": 7283822609.0, + "step": 14249 + }, + { + "epoch": 3.8534342888047592, + "grad_norm": 3.265625, + "learning_rate": 0.0043712682452318975, + "loss": 2.8745, + "mean_token_accuracy": 0.4059252142906189, + "num_tokens": 7284346825.0, + "step": 14250 + }, + { + "epoch": 3.853704705246079, + "grad_norm": 9.1875, + "learning_rate": 0.004370201994648992, + "loss": 10.8334, + "mean_token_accuracy": 0.00814130436629057, + "num_tokens": 7284768464.0, + "step": 14251 + }, + { + "epoch": 3.8539751216873985, + "grad_norm": 6.0625, + "learning_rate": 0.004369135947487789, + "loss": 3.0468, + "mean_token_accuracy": 0.4135618805885315, + "num_tokens": 7285255051.0, + "step": 14252 + }, + { + "epoch": 3.854245538128718, + "grad_norm": 2.421875, + "learning_rate": 0.004368070103780993, + "loss": 2.8107, + "mean_token_accuracy": 0.4462772607803345, + "num_tokens": 7285755623.0, + "step": 14253 + }, + { + "epoch": 3.854515954570038, + "grad_norm": 2.828125, + "learning_rate": 0.004367004463561309, + "loss": 2.8067, + "mean_token_accuracy": 0.4365839958190918, + "num_tokens": 7286279687.0, + "step": 14254 + }, + { + "epoch": 3.8547863710113575, + "grad_norm": 9.75, + "learning_rate": 0.004365939026861436, + "loss": 2.6279, + "mean_token_accuracy": 0.49777257442474365, + "num_tokens": 7286803893.0, + "step": 14255 + }, + { + "epoch": 3.855056787452677, + "grad_norm": 3.203125, + "learning_rate": 0.004364873793714065, + "loss": 3.022, + "mean_token_accuracy": 0.4138649106025696, + "num_tokens": 7287328140.0, + "step": 14256 + }, + { + "epoch": 3.8553272038939967, + "grad_norm": 2.65625, + "learning_rate": 0.004363808764151876, + "loss": 2.8908, + "mean_token_accuracy": 0.4254063367843628, + "num_tokens": 7287852261.0, + "step": 14257 + }, + { + "epoch": 3.8555976203353164, + "grad_norm": 2.84375, + "learning_rate": 0.004362743938207552, + "loss": 2.9203, + "mean_token_accuracy": 0.42604586482048035, + "num_tokens": 7288376446.0, + "step": 14258 + }, + { + "epoch": 3.855868036776636, + "grad_norm": 3.65625, + "learning_rate": 0.0043616793159137615, + "loss": 2.9878, + "mean_token_accuracy": 0.4387587010860443, + "num_tokens": 7288877901.0, + "step": 14259 + }, + { + "epoch": 3.8561384532179557, + "grad_norm": 7.28125, + "learning_rate": 0.00436061489730317, + "loss": 2.6922, + "mean_token_accuracy": 0.4783703088760376, + "num_tokens": 7289402097.0, + "step": 14260 + }, + { + "epoch": 3.8564088696592753, + "grad_norm": 2.328125, + "learning_rate": 0.004359550682408442, + "loss": 2.9224, + "mean_token_accuracy": 0.4216580390930176, + "num_tokens": 7289926376.0, + "step": 14261 + }, + { + "epoch": 3.856679286100595, + "grad_norm": 3.953125, + "learning_rate": 0.004358486671262227, + "loss": 3.0853, + "mean_token_accuracy": 0.4181823134422302, + "num_tokens": 7290450578.0, + "step": 14262 + }, + { + "epoch": 3.8569497025419146, + "grad_norm": 3.21875, + "learning_rate": 0.004357422863897169, + "loss": 2.8566, + "mean_token_accuracy": 0.4364641308784485, + "num_tokens": 7290944470.0, + "step": 14263 + }, + { + "epoch": 3.8572201189832342, + "grad_norm": 3.84375, + "learning_rate": 0.004356359260345916, + "loss": 2.8932, + "mean_token_accuracy": 0.42978179454803467, + "num_tokens": 7291433676.0, + "step": 14264 + }, + { + "epoch": 3.857490535424554, + "grad_norm": 2.78125, + "learning_rate": 0.004355295860641097, + "loss": 2.9103, + "mean_token_accuracy": 0.44978296756744385, + "num_tokens": 7291948630.0, + "step": 14265 + }, + { + "epoch": 3.8577609518658735, + "grad_norm": 3.859375, + "learning_rate": 0.0043542326648153395, + "loss": 2.8779, + "mean_token_accuracy": 0.4360175132751465, + "num_tokens": 7292433999.0, + "step": 14266 + }, + { + "epoch": 3.858031368307193, + "grad_norm": 2.765625, + "learning_rate": 0.004353169672901268, + "loss": 2.9148, + "mean_token_accuracy": 0.4151359796524048, + "num_tokens": 7292958152.0, + "step": 14267 + }, + { + "epoch": 3.858301784748513, + "grad_norm": 3.515625, + "learning_rate": 0.004352106884931502, + "loss": 2.7303, + "mean_token_accuracy": 0.4437183737754822, + "num_tokens": 7293481613.0, + "step": 14268 + }, + { + "epoch": 3.8585722011898325, + "grad_norm": 2.546875, + "learning_rate": 0.004351044300938643, + "loss": 2.9173, + "mean_token_accuracy": 0.4367092549800873, + "num_tokens": 7294005705.0, + "step": 14269 + }, + { + "epoch": 3.858842617631152, + "grad_norm": 3.015625, + "learning_rate": 0.0043499819209553, + "loss": 2.7814, + "mean_token_accuracy": 0.43103116750717163, + "num_tokens": 7294529950.0, + "step": 14270 + }, + { + "epoch": 3.8591130340724717, + "grad_norm": 5.625, + "learning_rate": 0.004348919745014069, + "loss": 9.945, + "mean_token_accuracy": 0.00010648691386450082, + "num_tokens": 7295054086.0, + "step": 14271 + }, + { + "epoch": 3.8593834505137914, + "grad_norm": 5.59375, + "learning_rate": 0.004347857773147537, + "loss": 2.8336, + "mean_token_accuracy": 0.41280996799468994, + "num_tokens": 7295578255.0, + "step": 14272 + }, + { + "epoch": 3.859653866955111, + "grad_norm": 2.40625, + "learning_rate": 0.004346796005388295, + "loss": 2.7089, + "mean_token_accuracy": 0.445355087518692, + "num_tokens": 7296101050.0, + "step": 14273 + }, + { + "epoch": 3.8599242833964302, + "grad_norm": 2.96875, + "learning_rate": 0.0043457344417689135, + "loss": 3.0064, + "mean_token_accuracy": 0.4267231225967407, + "num_tokens": 7296625169.0, + "step": 14274 + }, + { + "epoch": 3.8601946998377503, + "grad_norm": 3.203125, + "learning_rate": 0.004344673082321971, + "loss": 2.8551, + "mean_token_accuracy": 0.4171689450740814, + "num_tokens": 7297149423.0, + "step": 14275 + }, + { + "epoch": 3.8604651162790695, + "grad_norm": 50.75, + "learning_rate": 0.004343611927080031, + "loss": 2.8954, + "mean_token_accuracy": 0.44314470887184143, + "num_tokens": 7297665859.0, + "step": 14276 + }, + { + "epoch": 3.8607355327203896, + "grad_norm": 4.875, + "learning_rate": 0.004342550976075649, + "loss": 2.9306, + "mean_token_accuracy": 0.42606890201568604, + "num_tokens": 7298190071.0, + "step": 14277 + }, + { + "epoch": 3.861005949161709, + "grad_norm": 2.40625, + "learning_rate": 0.004341490229341383, + "loss": 2.8652, + "mean_token_accuracy": 0.42732593417167664, + "num_tokens": 7298714270.0, + "step": 14278 + }, + { + "epoch": 3.861276365603029, + "grad_norm": 3.359375, + "learning_rate": 0.004340429686909779, + "loss": 2.9443, + "mean_token_accuracy": 0.4278741478919983, + "num_tokens": 7299238548.0, + "step": 14279 + }, + { + "epoch": 3.861546782044348, + "grad_norm": 2.9375, + "learning_rate": 0.004339369348813375, + "loss": 2.8497, + "mean_token_accuracy": 0.42090460658073425, + "num_tokens": 7299762805.0, + "step": 14280 + }, + { + "epoch": 3.861817198485668, + "grad_norm": 3.453125, + "learning_rate": 0.004338309215084706, + "loss": 2.8495, + "mean_token_accuracy": 0.42433497309684753, + "num_tokens": 7300287073.0, + "step": 14281 + }, + { + "epoch": 3.8620876149269874, + "grad_norm": 2.828125, + "learning_rate": 0.004337249285756304, + "loss": 2.8003, + "mean_token_accuracy": 0.44902098178863525, + "num_tokens": 7300765159.0, + "step": 14282 + }, + { + "epoch": 3.8623580313683075, + "grad_norm": 2.90625, + "learning_rate": 0.004336189560860686, + "loss": 2.6754, + "mean_token_accuracy": 0.44859036803245544, + "num_tokens": 7301262909.0, + "step": 14283 + }, + { + "epoch": 3.8626284478096267, + "grad_norm": 2.921875, + "learning_rate": 0.00433513004043037, + "loss": 2.9878, + "mean_token_accuracy": 0.4282683730125427, + "num_tokens": 7301787186.0, + "step": 14284 + }, + { + "epoch": 3.8628988642509463, + "grad_norm": 4.5, + "learning_rate": 0.004334070724497866, + "loss": 3.1031, + "mean_token_accuracy": 0.39745521545410156, + "num_tokens": 7302311410.0, + "step": 14285 + }, + { + "epoch": 3.863169280692266, + "grad_norm": 3.28125, + "learning_rate": 0.004333011613095672, + "loss": 2.8976, + "mean_token_accuracy": 0.4186916649341583, + "num_tokens": 7302835589.0, + "step": 14286 + }, + { + "epoch": 3.8634396971335856, + "grad_norm": 3.234375, + "learning_rate": 0.0043319527062562905, + "loss": 3.0564, + "mean_token_accuracy": 0.40695685148239136, + "num_tokens": 7303359740.0, + "step": 14287 + }, + { + "epoch": 3.8637101135749052, + "grad_norm": 2.984375, + "learning_rate": 0.00433089400401221, + "loss": 2.8457, + "mean_token_accuracy": 0.4247191250324249, + "num_tokens": 7303884015.0, + "step": 14288 + }, + { + "epoch": 3.863980530016225, + "grad_norm": 2.65625, + "learning_rate": 0.004329835506395912, + "loss": 3.0404, + "mean_token_accuracy": 0.4165763258934021, + "num_tokens": 7304408060.0, + "step": 14289 + }, + { + "epoch": 3.8642509464575445, + "grad_norm": 3.359375, + "learning_rate": 0.004328777213439878, + "loss": 2.5857, + "mean_token_accuracy": 0.45587605237960815, + "num_tokens": 7304932342.0, + "step": 14290 + }, + { + "epoch": 3.864521362898864, + "grad_norm": 10.5625, + "learning_rate": 0.004327719125176578, + "loss": 10.3571, + "mean_token_accuracy": 0.0516408234834671, + "num_tokens": 7305456552.0, + "step": 14291 + }, + { + "epoch": 3.864791779340184, + "grad_norm": 8.75, + "learning_rate": 0.004326661241638475, + "loss": 2.8967, + "mean_token_accuracy": 0.4442041516304016, + "num_tokens": 7305980820.0, + "step": 14292 + }, + { + "epoch": 3.8650621957815035, + "grad_norm": 4.40625, + "learning_rate": 0.004325603562858032, + "loss": 2.8773, + "mean_token_accuracy": 0.43703973293304443, + "num_tokens": 7306505035.0, + "step": 14293 + }, + { + "epoch": 3.865332612222823, + "grad_norm": 2.84375, + "learning_rate": 0.0043245460888676965, + "loss": 2.9424, + "mean_token_accuracy": 0.4324771463871002, + "num_tokens": 7307029317.0, + "step": 14294 + }, + { + "epoch": 3.8656030286641427, + "grad_norm": 3.828125, + "learning_rate": 0.004323488819699922, + "loss": 2.946, + "mean_token_accuracy": 0.43082988262176514, + "num_tokens": 7307515797.0, + "step": 14295 + }, + { + "epoch": 3.8658734451054624, + "grad_norm": 2.765625, + "learning_rate": 0.0043224317553871424, + "loss": 2.6793, + "mean_token_accuracy": 0.4569293260574341, + "num_tokens": 7308035351.0, + "step": 14296 + }, + { + "epoch": 3.866143861546782, + "grad_norm": 4.0, + "learning_rate": 0.004321374895961794, + "loss": 3.0451, + "mean_token_accuracy": 0.4199381470680237, + "num_tokens": 7308497893.0, + "step": 14297 + }, + { + "epoch": 3.8664142779881017, + "grad_norm": 2.015625, + "learning_rate": 0.004320318241456307, + "loss": 2.8015, + "mean_token_accuracy": 0.43854671716690063, + "num_tokens": 7309022075.0, + "step": 14298 + }, + { + "epoch": 3.8666846944294213, + "grad_norm": 3.078125, + "learning_rate": 0.0043192617919031, + "loss": 2.9143, + "mean_token_accuracy": 0.4179422855377197, + "num_tokens": 7309546339.0, + "step": 14299 + }, + { + "epoch": 3.866955110870741, + "grad_norm": 2.59375, + "learning_rate": 0.004318205547334584, + "loss": 2.8917, + "mean_token_accuracy": 0.4370725750923157, + "num_tokens": 7310022994.0, + "step": 14300 + }, + { + "epoch": 3.8672255273120606, + "grad_norm": 3.046875, + "learning_rate": 0.004317149507783175, + "loss": 3.0222, + "mean_token_accuracy": 0.43175971508026123, + "num_tokens": 7310547135.0, + "step": 14301 + }, + { + "epoch": 3.8674959437533802, + "grad_norm": 3.015625, + "learning_rate": 0.004316093673281273, + "loss": 3.0181, + "mean_token_accuracy": 0.41134917736053467, + "num_tokens": 7311071405.0, + "step": 14302 + }, + { + "epoch": 3.8677663601947, + "grad_norm": 2.859375, + "learning_rate": 0.00431503804386127, + "loss": 2.6046, + "mean_token_accuracy": 0.4449309706687927, + "num_tokens": 7311595577.0, + "step": 14303 + }, + { + "epoch": 3.8680367766360195, + "grad_norm": 2.546875, + "learning_rate": 0.004313982619555562, + "loss": 2.8803, + "mean_token_accuracy": 0.430259644985199, + "num_tokens": 7312119134.0, + "step": 14304 + }, + { + "epoch": 3.868307193077339, + "grad_norm": 3.109375, + "learning_rate": 0.004312927400396529, + "loss": 2.8024, + "mean_token_accuracy": 0.43581467866897583, + "num_tokens": 7312643270.0, + "step": 14305 + }, + { + "epoch": 3.868577609518659, + "grad_norm": 4.125, + "learning_rate": 0.004311872386416547, + "loss": 2.9393, + "mean_token_accuracy": 0.4289678931236267, + "num_tokens": 7313167516.0, + "step": 14306 + }, + { + "epoch": 3.8688480259599785, + "grad_norm": 2.875, + "learning_rate": 0.00431081757764799, + "loss": 2.9269, + "mean_token_accuracy": 0.4052991271018982, + "num_tokens": 7313639378.0, + "step": 14307 + }, + { + "epoch": 3.869118442401298, + "grad_norm": 2.9375, + "learning_rate": 0.0043097629741232196, + "loss": 2.8233, + "mean_token_accuracy": 0.42793989181518555, + "num_tokens": 7314163605.0, + "step": 14308 + }, + { + "epoch": 3.8693888588426177, + "grad_norm": 2.703125, + "learning_rate": 0.004308708575874598, + "loss": 2.7625, + "mean_token_accuracy": 0.4461396634578705, + "num_tokens": 7314687755.0, + "step": 14309 + }, + { + "epoch": 3.8696592752839374, + "grad_norm": 3.15625, + "learning_rate": 0.0043076543829344725, + "loss": 2.7618, + "mean_token_accuracy": 0.43394121527671814, + "num_tokens": 7315211994.0, + "step": 14310 + }, + { + "epoch": 3.869929691725257, + "grad_norm": 3.484375, + "learning_rate": 0.004306600395335194, + "loss": 9.3026, + "mean_token_accuracy": 0.006998810451477766, + "num_tokens": 7315736153.0, + "step": 14311 + }, + { + "epoch": 3.8702001081665767, + "grad_norm": 6.625, + "learning_rate": 0.004305546613109097, + "loss": 2.8214, + "mean_token_accuracy": 0.4174489974975586, + "num_tokens": 7316260367.0, + "step": 14312 + }, + { + "epoch": 3.8704705246078963, + "grad_norm": 2.375, + "learning_rate": 0.00430449303628852, + "loss": 2.9062, + "mean_token_accuracy": 0.4335569143295288, + "num_tokens": 7316784645.0, + "step": 14313 + }, + { + "epoch": 3.870740941049216, + "grad_norm": 2.90625, + "learning_rate": 0.004303439664905786, + "loss": 3.0182, + "mean_token_accuracy": 0.42854043841362, + "num_tokens": 7317287597.0, + "step": 14314 + }, + { + "epoch": 3.871011357490535, + "grad_norm": 3.015625, + "learning_rate": 0.004302386498993214, + "loss": 2.9044, + "mean_token_accuracy": 0.42588287591934204, + "num_tokens": 7317811765.0, + "step": 14315 + }, + { + "epoch": 3.8712817739318552, + "grad_norm": 3.453125, + "learning_rate": 0.004301333538583124, + "loss": 2.9514, + "mean_token_accuracy": 0.41186630725860596, + "num_tokens": 7318336046.0, + "step": 14316 + }, + { + "epoch": 3.8715521903731744, + "grad_norm": 3.109375, + "learning_rate": 0.004300280783707819, + "loss": 2.8551, + "mean_token_accuracy": 0.4295424818992615, + "num_tokens": 7318860241.0, + "step": 14317 + }, + { + "epoch": 3.8718226068144945, + "grad_norm": 3.21875, + "learning_rate": 0.004299228234399602, + "loss": 2.8728, + "mean_token_accuracy": 0.42793726921081543, + "num_tokens": 7319384451.0, + "step": 14318 + }, + { + "epoch": 3.8720930232558137, + "grad_norm": 2.65625, + "learning_rate": 0.004298175890690769, + "loss": 2.8186, + "mean_token_accuracy": 0.4564168453216553, + "num_tokens": 7319899872.0, + "step": 14319 + }, + { + "epoch": 3.872363439697134, + "grad_norm": 3.328125, + "learning_rate": 0.004297123752613607, + "loss": 2.774, + "mean_token_accuracy": 0.44125011563301086, + "num_tokens": 7320424138.0, + "step": 14320 + }, + { + "epoch": 3.872633856138453, + "grad_norm": 2.953125, + "learning_rate": 0.004296071820200402, + "loss": 2.878, + "mean_token_accuracy": 0.41789817810058594, + "num_tokens": 7320908337.0, + "step": 14321 + }, + { + "epoch": 3.872904272579773, + "grad_norm": 3.90625, + "learning_rate": 0.0042950200934834286, + "loss": 3.0572, + "mean_token_accuracy": 0.4227232336997986, + "num_tokens": 7321400008.0, + "step": 14322 + }, + { + "epoch": 3.8731746890210923, + "grad_norm": 4.4375, + "learning_rate": 0.004293968572494954, + "loss": 2.7106, + "mean_token_accuracy": 0.44520795345306396, + "num_tokens": 7321924278.0, + "step": 14323 + }, + { + "epoch": 3.8734451054624124, + "grad_norm": 2.59375, + "learning_rate": 0.004292917257267245, + "loss": 2.9411, + "mean_token_accuracy": 0.43063217401504517, + "num_tokens": 7322448338.0, + "step": 14324 + }, + { + "epoch": 3.8737155219037316, + "grad_norm": 4.5, + "learning_rate": 0.004291866147832561, + "loss": 2.9451, + "mean_token_accuracy": 0.41260626912117004, + "num_tokens": 7322972237.0, + "step": 14325 + }, + { + "epoch": 3.8739859383450512, + "grad_norm": 3.5, + "learning_rate": 0.004290815244223149, + "loss": 2.5318, + "mean_token_accuracy": 0.5082193613052368, + "num_tokens": 7323414101.0, + "step": 14326 + }, + { + "epoch": 3.874256354786371, + "grad_norm": 2.84375, + "learning_rate": 0.004289764546471259, + "loss": 2.9381, + "mean_token_accuracy": 0.4268510341644287, + "num_tokens": 7323911068.0, + "step": 14327 + }, + { + "epoch": 3.8745267712276905, + "grad_norm": 61.75, + "learning_rate": 0.004288714054609124, + "loss": 3.1626, + "mean_token_accuracy": 0.3983551859855652, + "num_tokens": 7324384446.0, + "step": 14328 + }, + { + "epoch": 3.87479718766901, + "grad_norm": 7.25, + "learning_rate": 0.004287663768668978, + "loss": 3.0522, + "mean_token_accuracy": 0.4141209125518799, + "num_tokens": 7324887569.0, + "step": 14329 + }, + { + "epoch": 3.87506760411033, + "grad_norm": 2.140625, + "learning_rate": 0.004286613688683049, + "loss": 2.9897, + "mean_token_accuracy": 0.42305830121040344, + "num_tokens": 7325362464.0, + "step": 14330 + }, + { + "epoch": 3.8753380205516494, + "grad_norm": 11.0625, + "learning_rate": 0.004285563814683554, + "loss": 9.6864, + "mean_token_accuracy": 0.015179854817688465, + "num_tokens": 7325886494.0, + "step": 14331 + }, + { + "epoch": 3.875608436992969, + "grad_norm": 7.9375, + "learning_rate": 0.004284514146702706, + "loss": 3.0134, + "mean_token_accuracy": 0.42865943908691406, + "num_tokens": 7326410640.0, + "step": 14332 + }, + { + "epoch": 3.8758788534342887, + "grad_norm": 3.78125, + "learning_rate": 0.004283464684772714, + "loss": 2.9665, + "mean_token_accuracy": 0.41320836544036865, + "num_tokens": 7326934834.0, + "step": 14333 + }, + { + "epoch": 3.8761492698756084, + "grad_norm": 2.453125, + "learning_rate": 0.004282415428925779, + "loss": 2.9567, + "mean_token_accuracy": 0.4320492148399353, + "num_tokens": 7327421080.0, + "step": 14334 + }, + { + "epoch": 3.876419686316928, + "grad_norm": 3.421875, + "learning_rate": 0.00428136637919409, + "loss": 3.019, + "mean_token_accuracy": 0.4238821268081665, + "num_tokens": 7327945358.0, + "step": 14335 + }, + { + "epoch": 3.8766901027582477, + "grad_norm": 3.5, + "learning_rate": 0.0042803175356098426, + "loss": 2.9538, + "mean_token_accuracy": 0.4102921485900879, + "num_tokens": 7328469570.0, + "step": 14336 + }, + { + "epoch": 3.8769605191995673, + "grad_norm": 2.71875, + "learning_rate": 0.004279268898205211, + "loss": 2.9191, + "mean_token_accuracy": 0.4516311287879944, + "num_tokens": 7328945503.0, + "step": 14337 + }, + { + "epoch": 3.877230935640887, + "grad_norm": 3.75, + "learning_rate": 0.004278220467012376, + "loss": 2.8748, + "mean_token_accuracy": 0.44131067395210266, + "num_tokens": 7329435295.0, + "step": 14338 + }, + { + "epoch": 3.8775013520822066, + "grad_norm": 10.1875, + "learning_rate": 0.004277172242063503, + "loss": 2.6951, + "mean_token_accuracy": 0.4718126654624939, + "num_tokens": 7329936005.0, + "step": 14339 + }, + { + "epoch": 3.8777717685235262, + "grad_norm": 2.671875, + "learning_rate": 0.004276124223390758, + "loss": 2.6009, + "mean_token_accuracy": 0.4365173876285553, + "num_tokens": 7330460151.0, + "step": 14340 + }, + { + "epoch": 3.878042184964846, + "grad_norm": 2.765625, + "learning_rate": 0.004275076411026294, + "loss": 2.8945, + "mean_token_accuracy": 0.4273530840873718, + "num_tokens": 7330984366.0, + "step": 14341 + }, + { + "epoch": 3.8783126014061655, + "grad_norm": 2.875, + "learning_rate": 0.0042740288050022645, + "loss": 2.7067, + "mean_token_accuracy": 0.43934255838394165, + "num_tokens": 7331508650.0, + "step": 14342 + }, + { + "epoch": 3.878583017847485, + "grad_norm": 3.234375, + "learning_rate": 0.004272981405350808, + "loss": 2.808, + "mean_token_accuracy": 0.42076432704925537, + "num_tokens": 7332032731.0, + "step": 14343 + }, + { + "epoch": 3.878853434288805, + "grad_norm": 2.90625, + "learning_rate": 0.004271934212104068, + "loss": 2.9831, + "mean_token_accuracy": 0.4194585084915161, + "num_tokens": 7332557013.0, + "step": 14344 + }, + { + "epoch": 3.8791238507301244, + "grad_norm": 3.40625, + "learning_rate": 0.0042708872252941715, + "loss": 2.9746, + "mean_token_accuracy": 0.4302701950073242, + "num_tokens": 7333056716.0, + "step": 14345 + }, + { + "epoch": 3.879394267171444, + "grad_norm": 3.15625, + "learning_rate": 0.004269840444953241, + "loss": 2.9551, + "mean_token_accuracy": 0.42554277181625366, + "num_tokens": 7333580920.0, + "step": 14346 + }, + { + "epoch": 3.8796646836127637, + "grad_norm": 3.46875, + "learning_rate": 0.004268793871113401, + "loss": 2.9266, + "mean_token_accuracy": 0.43066027760505676, + "num_tokens": 7334105100.0, + "step": 14347 + }, + { + "epoch": 3.8799351000540834, + "grad_norm": 3.203125, + "learning_rate": 0.004267747503806762, + "loss": 2.8823, + "mean_token_accuracy": 0.4207450747489929, + "num_tokens": 7334629360.0, + "step": 14348 + }, + { + "epoch": 3.880205516495403, + "grad_norm": 3.234375, + "learning_rate": 0.004266701343065422, + "loss": 2.835, + "mean_token_accuracy": 0.43759870529174805, + "num_tokens": 7335153585.0, + "step": 14349 + }, + { + "epoch": 3.8804759329367227, + "grad_norm": 2.953125, + "learning_rate": 0.0042656553889214915, + "loss": 2.7609, + "mean_token_accuracy": 0.4255692958831787, + "num_tokens": 7335677797.0, + "step": 14350 + }, + { + "epoch": 3.8807463493780423, + "grad_norm": 27.25, + "learning_rate": 0.004264609641407053, + "loss": 10.4371, + "mean_token_accuracy": 0.0035477972123771906, + "num_tokens": 7336201877.0, + "step": 14351 + }, + { + "epoch": 3.881016765819362, + "grad_norm": 6.28125, + "learning_rate": 0.004263564100554204, + "loss": 2.934, + "mean_token_accuracy": 0.4155341684818268, + "num_tokens": 7336726127.0, + "step": 14352 + }, + { + "epoch": 3.8812871822606816, + "grad_norm": 2.1875, + "learning_rate": 0.004262518766395014, + "loss": 3.0161, + "mean_token_accuracy": 0.3865884840488434, + "num_tokens": 7337250335.0, + "step": 14353 + }, + { + "epoch": 3.8815575987020012, + "grad_norm": 4.4375, + "learning_rate": 0.004261473638961565, + "loss": 2.6661, + "mean_token_accuracy": 0.45510292053222656, + "num_tokens": 7337774607.0, + "step": 14354 + }, + { + "epoch": 3.881828015143321, + "grad_norm": 2.59375, + "learning_rate": 0.00426042871828592, + "loss": 2.9759, + "mean_token_accuracy": 0.43681687116622925, + "num_tokens": 7338234944.0, + "step": 14355 + }, + { + "epoch": 3.88209843158464, + "grad_norm": 3.0625, + "learning_rate": 0.004259384004400142, + "loss": 3.0139, + "mean_token_accuracy": 0.41549569368362427, + "num_tokens": 7338727176.0, + "step": 14356 + }, + { + "epoch": 3.88236884802596, + "grad_norm": 3.484375, + "learning_rate": 0.004258339497336287, + "loss": 3.1262, + "mean_token_accuracy": 0.41225361824035645, + "num_tokens": 7339251443.0, + "step": 14357 + }, + { + "epoch": 3.8826392644672794, + "grad_norm": 3.125, + "learning_rate": 0.0042572951971264, + "loss": 2.7763, + "mean_token_accuracy": 0.4460830092430115, + "num_tokens": 7339775643.0, + "step": 14358 + }, + { + "epoch": 3.8829096809085994, + "grad_norm": 3.0625, + "learning_rate": 0.004256251103802526, + "loss": 3.0065, + "mean_token_accuracy": 0.4286293089389801, + "num_tokens": 7340257227.0, + "step": 14359 + }, + { + "epoch": 3.8831800973499186, + "grad_norm": 15.9375, + "learning_rate": 0.004255207217396702, + "loss": 2.8069, + "mean_token_accuracy": 0.4341283440589905, + "num_tokens": 7340751244.0, + "step": 14360 + }, + { + "epoch": 3.8834505137912387, + "grad_norm": 3.75, + "learning_rate": 0.004254163537940953, + "loss": 2.9276, + "mean_token_accuracy": 0.4117571711540222, + "num_tokens": 7341275518.0, + "step": 14361 + }, + { + "epoch": 3.883720930232558, + "grad_norm": 1.90625, + "learning_rate": 0.004253120065467306, + "loss": 2.8406, + "mean_token_accuracy": 0.4394378066062927, + "num_tokens": 7341726620.0, + "step": 14362 + }, + { + "epoch": 3.883991346673878, + "grad_norm": 2.796875, + "learning_rate": 0.004252076800007778, + "loss": 2.9129, + "mean_token_accuracy": 0.4353440999984741, + "num_tokens": 7342250826.0, + "step": 14363 + }, + { + "epoch": 3.884261763115197, + "grad_norm": 2.84375, + "learning_rate": 0.004251033741594377, + "loss": 2.7448, + "mean_token_accuracy": 0.4317265748977661, + "num_tokens": 7342775094.0, + "step": 14364 + }, + { + "epoch": 3.8845321795565173, + "grad_norm": 3.203125, + "learning_rate": 0.004249990890259108, + "loss": 2.6832, + "mean_token_accuracy": 0.44783997535705566, + "num_tokens": 7343267730.0, + "step": 14365 + }, + { + "epoch": 3.8848025959978365, + "grad_norm": 3.1875, + "learning_rate": 0.004248948246033968, + "loss": 2.8862, + "mean_token_accuracy": 0.42502930760383606, + "num_tokens": 7343791879.0, + "step": 14366 + }, + { + "epoch": 3.885073012439156, + "grad_norm": 2.875, + "learning_rate": 0.0042479058089509485, + "loss": 2.8119, + "mean_token_accuracy": 0.438294380903244, + "num_tokens": 7344316119.0, + "step": 14367 + }, + { + "epoch": 3.885343428880476, + "grad_norm": 3.3125, + "learning_rate": 0.004246863579042038, + "loss": 2.8313, + "mean_token_accuracy": 0.43870046734809875, + "num_tokens": 7344840198.0, + "step": 14368 + }, + { + "epoch": 3.8856138453217954, + "grad_norm": 3.1875, + "learning_rate": 0.00424582155633921, + "loss": 2.9876, + "mean_token_accuracy": 0.4224836230278015, + "num_tokens": 7345317982.0, + "step": 14369 + }, + { + "epoch": 3.885884261763115, + "grad_norm": 3.03125, + "learning_rate": 0.004244779740874442, + "loss": 2.8262, + "mean_token_accuracy": 0.42974311113357544, + "num_tokens": 7345842191.0, + "step": 14370 + }, + { + "epoch": 3.8861546782044347, + "grad_norm": 4.96875, + "learning_rate": 0.004243738132679698, + "loss": 9.9631, + "mean_token_accuracy": 0.011156070977449417, + "num_tokens": 7346366395.0, + "step": 14371 + }, + { + "epoch": 3.8864250946457544, + "grad_norm": 6.4375, + "learning_rate": 0.0042426967317869355, + "loss": 3.0665, + "mean_token_accuracy": 0.41308438777923584, + "num_tokens": 7346867091.0, + "step": 14372 + }, + { + "epoch": 3.886695511087074, + "grad_norm": 2.234375, + "learning_rate": 0.00424165553822811, + "loss": 2.93, + "mean_token_accuracy": 0.43666738271713257, + "num_tokens": 7347391129.0, + "step": 14373 + }, + { + "epoch": 3.8869659275283936, + "grad_norm": 3.3125, + "learning_rate": 0.004240614552035168, + "loss": 2.7812, + "mean_token_accuracy": 0.430039644241333, + "num_tokens": 7347915157.0, + "step": 14374 + }, + { + "epoch": 3.8872363439697133, + "grad_norm": 4.46875, + "learning_rate": 0.004239573773240047, + "loss": 2.6691, + "mean_token_accuracy": 0.430366188287735, + "num_tokens": 7348439431.0, + "step": 14375 + }, + { + "epoch": 3.887506760411033, + "grad_norm": 2.78125, + "learning_rate": 0.004238533201874685, + "loss": 2.8514, + "mean_token_accuracy": 0.4632265567779541, + "num_tokens": 7348863402.0, + "step": 14376 + }, + { + "epoch": 3.8877771768523526, + "grad_norm": 2.953125, + "learning_rate": 0.00423749283797101, + "loss": 2.8497, + "mean_token_accuracy": 0.4151856303215027, + "num_tokens": 7349339104.0, + "step": 14377 + }, + { + "epoch": 3.888047593293672, + "grad_norm": 4.6875, + "learning_rate": 0.0042364526815609396, + "loss": 2.464, + "mean_token_accuracy": 0.4825115501880646, + "num_tokens": 7349863385.0, + "step": 14378 + }, + { + "epoch": 3.888318009734992, + "grad_norm": 2.21875, + "learning_rate": 0.0042354127326763935, + "loss": 2.9142, + "mean_token_accuracy": 0.4266738295555115, + "num_tokens": 7350387647.0, + "step": 14379 + }, + { + "epoch": 3.8885884261763115, + "grad_norm": 2.6875, + "learning_rate": 0.004234372991349275, + "loss": 2.7079, + "mean_token_accuracy": 0.44504594802856445, + "num_tokens": 7350911839.0, + "step": 14380 + }, + { + "epoch": 3.888858842617631, + "grad_norm": 3.390625, + "learning_rate": 0.004233333457611491, + "loss": 3.0212, + "mean_token_accuracy": 0.4325231909751892, + "num_tokens": 7351418293.0, + "step": 14381 + }, + { + "epoch": 3.889129259058951, + "grad_norm": 3.90625, + "learning_rate": 0.004232294131494937, + "loss": 2.8855, + "mean_token_accuracy": 0.4351045787334442, + "num_tokens": 7351942564.0, + "step": 14382 + }, + { + "epoch": 3.8893996755002704, + "grad_norm": 3.546875, + "learning_rate": 0.004231255013031502, + "loss": 2.6208, + "mean_token_accuracy": 0.4646238386631012, + "num_tokens": 7352444828.0, + "step": 14383 + }, + { + "epoch": 3.88967009194159, + "grad_norm": 2.75, + "learning_rate": 0.0042302161022530664, + "loss": 2.9023, + "mean_token_accuracy": 0.4478556513786316, + "num_tokens": 7352957281.0, + "step": 14384 + }, + { + "epoch": 3.8899405083829097, + "grad_norm": 3.140625, + "learning_rate": 0.004229177399191512, + "loss": 2.7997, + "mean_token_accuracy": 0.4295538067817688, + "num_tokens": 7353481469.0, + "step": 14385 + }, + { + "epoch": 3.8902109248242294, + "grad_norm": 2.796875, + "learning_rate": 0.004228138903878707, + "loss": 2.831, + "mean_token_accuracy": 0.4317571520805359, + "num_tokens": 7353981988.0, + "step": 14386 + }, + { + "epoch": 3.890481341265549, + "grad_norm": 3.0625, + "learning_rate": 0.004227100616346512, + "loss": 2.8878, + "mean_token_accuracy": 0.43403732776641846, + "num_tokens": 7354506259.0, + "step": 14387 + }, + { + "epoch": 3.8907517577068687, + "grad_norm": 3.328125, + "learning_rate": 0.004226062536626792, + "loss": 2.792, + "mean_token_accuracy": 0.4279636740684509, + "num_tokens": 7355000361.0, + "step": 14388 + }, + { + "epoch": 3.8910221741481883, + "grad_norm": 2.71875, + "learning_rate": 0.004225024664751389, + "loss": 2.8926, + "mean_token_accuracy": 0.4159470796585083, + "num_tokens": 7355520421.0, + "step": 14389 + }, + { + "epoch": 3.891292590589508, + "grad_norm": 3.3125, + "learning_rate": 0.004223987000752157, + "loss": 2.9444, + "mean_token_accuracy": 0.42044442892074585, + "num_tokens": 7356044697.0, + "step": 14390 + }, + { + "epoch": 3.8915630070308276, + "grad_norm": 7.03125, + "learning_rate": 0.004222949544660931, + "loss": 9.6195, + "mean_token_accuracy": 0.020528588443994522, + "num_tokens": 7356568862.0, + "step": 14391 + }, + { + "epoch": 3.8918334234721472, + "grad_norm": 5.90625, + "learning_rate": 0.00422191229650954, + "loss": 3.0348, + "mean_token_accuracy": 0.41486871242523193, + "num_tokens": 7357040280.0, + "step": 14392 + }, + { + "epoch": 3.892103839913467, + "grad_norm": 2.75, + "learning_rate": 0.0042208752563298165, + "loss": 2.7265, + "mean_token_accuracy": 0.42702731490135193, + "num_tokens": 7357564496.0, + "step": 14393 + }, + { + "epoch": 3.8923742563547865, + "grad_norm": 6.65625, + "learning_rate": 0.004219838424153573, + "loss": 2.5446, + "mean_token_accuracy": 0.4510859251022339, + "num_tokens": 7358088593.0, + "step": 14394 + }, + { + "epoch": 3.892644672796106, + "grad_norm": 3.5625, + "learning_rate": 0.004218801800012628, + "loss": 2.7991, + "mean_token_accuracy": 0.4233565330505371, + "num_tokens": 7358612816.0, + "step": 14395 + }, + { + "epoch": 3.892915089237426, + "grad_norm": 3.0625, + "learning_rate": 0.004217765383938784, + "loss": 3.1321, + "mean_token_accuracy": 0.3761550784111023, + "num_tokens": 7359137029.0, + "step": 14396 + }, + { + "epoch": 3.893185505678745, + "grad_norm": 3.609375, + "learning_rate": 0.004216729175963845, + "loss": 2.9532, + "mean_token_accuracy": 0.4211784303188324, + "num_tokens": 7359661245.0, + "step": 14397 + }, + { + "epoch": 3.893455922120065, + "grad_norm": 3.3125, + "learning_rate": 0.0042156931761196, + "loss": 2.8266, + "mean_token_accuracy": 0.43191787600517273, + "num_tokens": 7360185512.0, + "step": 14398 + }, + { + "epoch": 3.8937263385613843, + "grad_norm": 3.03125, + "learning_rate": 0.0042146573844378445, + "loss": 2.6876, + "mean_token_accuracy": 0.4628722071647644, + "num_tokens": 7360698357.0, + "step": 14399 + }, + { + "epoch": 3.8939967550027044, + "grad_norm": 3.65625, + "learning_rate": 0.004213621800950354, + "loss": 2.8421, + "mean_token_accuracy": 0.441838800907135, + "num_tokens": 7361222541.0, + "step": 14400 + }, + { + "epoch": 3.8942671714440236, + "grad_norm": 3.140625, + "learning_rate": 0.004212586425688902, + "loss": 2.7435, + "mean_token_accuracy": 0.4499853253364563, + "num_tokens": 7361746822.0, + "step": 14401 + }, + { + "epoch": 3.8945375878853437, + "grad_norm": 3.140625, + "learning_rate": 0.004211551258685259, + "loss": 2.9316, + "mean_token_accuracy": 0.4162077307701111, + "num_tokens": 7362270990.0, + "step": 14402 + }, + { + "epoch": 3.894808004326663, + "grad_norm": 3.203125, + "learning_rate": 0.004210516299971189, + "loss": 3.0381, + "mean_token_accuracy": 0.4165886640548706, + "num_tokens": 7362773802.0, + "step": 14403 + }, + { + "epoch": 3.895078420767983, + "grad_norm": 4.3125, + "learning_rate": 0.004209481549578444, + "loss": 2.3884, + "mean_token_accuracy": 0.48904985189437866, + "num_tokens": 7363298036.0, + "step": 14404 + }, + { + "epoch": 3.895348837209302, + "grad_norm": 2.828125, + "learning_rate": 0.004208447007538774, + "loss": 2.5359, + "mean_token_accuracy": 0.4949752986431122, + "num_tokens": 7363761781.0, + "step": 14405 + }, + { + "epoch": 3.8956192536506222, + "grad_norm": 2.234375, + "learning_rate": 0.004207412673883925, + "loss": 2.9654, + "mean_token_accuracy": 0.43770912289619446, + "num_tokens": 7364240418.0, + "step": 14406 + }, + { + "epoch": 3.8958896700919414, + "grad_norm": 2.484375, + "learning_rate": 0.004206378548645627, + "loss": 2.7814, + "mean_token_accuracy": 0.43478283286094666, + "num_tokens": 7364764417.0, + "step": 14407 + }, + { + "epoch": 3.896160086533261, + "grad_norm": 3.25, + "learning_rate": 0.004205344631855618, + "loss": 2.8418, + "mean_token_accuracy": 0.43258339166641235, + "num_tokens": 7365282880.0, + "step": 14408 + }, + { + "epoch": 3.8964305029745807, + "grad_norm": 2.390625, + "learning_rate": 0.0042043109235456125, + "loss": 3.0377, + "mean_token_accuracy": 0.4053313136100769, + "num_tokens": 7365807005.0, + "step": 14409 + }, + { + "epoch": 3.8967009194159004, + "grad_norm": 3.0625, + "learning_rate": 0.004203277423747335, + "loss": 2.8821, + "mean_token_accuracy": 0.43404117226600647, + "num_tokens": 7366331204.0, + "step": 14410 + }, + { + "epoch": 3.89697133585722, + "grad_norm": 17.375, + "learning_rate": 0.004202244132492495, + "loss": 11.8679, + "mean_token_accuracy": 0.03465699404478073, + "num_tokens": 7366831122.0, + "step": 14411 + }, + { + "epoch": 3.8972417522985396, + "grad_norm": 6.375, + "learning_rate": 0.004201211049812794, + "loss": 3.013, + "mean_token_accuracy": 0.4124014377593994, + "num_tokens": 7367306573.0, + "step": 14412 + }, + { + "epoch": 3.8975121687398593, + "grad_norm": 2.53125, + "learning_rate": 0.0042001781757399345, + "loss": 2.8201, + "mean_token_accuracy": 0.4684382677078247, + "num_tokens": 7367765958.0, + "step": 14413 + }, + { + "epoch": 3.897782585181179, + "grad_norm": 3.484375, + "learning_rate": 0.0041991455103056055, + "loss": 2.8361, + "mean_token_accuracy": 0.4351201057434082, + "num_tokens": 7368290009.0, + "step": 14414 + }, + { + "epoch": 3.8980530016224986, + "grad_norm": 3.296875, + "learning_rate": 0.00419811305354149, + "loss": 2.659, + "mean_token_accuracy": 0.45136985182762146, + "num_tokens": 7368759686.0, + "step": 14415 + }, + { + "epoch": 3.898323418063818, + "grad_norm": 2.515625, + "learning_rate": 0.0041970808054792716, + "loss": 2.8644, + "mean_token_accuracy": 0.42660051584243774, + "num_tokens": 7369283868.0, + "step": 14416 + }, + { + "epoch": 3.898593834505138, + "grad_norm": 3.484375, + "learning_rate": 0.0041960487661506195, + "loss": 2.9259, + "mean_token_accuracy": 0.41645416617393494, + "num_tokens": 7369739851.0, + "step": 14417 + }, + { + "epoch": 3.8988642509464575, + "grad_norm": 2.890625, + "learning_rate": 0.004195016935587198, + "loss": 2.8617, + "mean_token_accuracy": 0.4354107975959778, + "num_tokens": 7370264000.0, + "step": 14418 + }, + { + "epoch": 3.899134667387777, + "grad_norm": 3.0, + "learning_rate": 0.0041939853138206725, + "loss": 2.7731, + "mean_token_accuracy": 0.43076086044311523, + "num_tokens": 7370788071.0, + "step": 14419 + }, + { + "epoch": 3.899405083829097, + "grad_norm": 2.71875, + "learning_rate": 0.004192953900882692, + "loss": 2.834, + "mean_token_accuracy": 0.4359889030456543, + "num_tokens": 7371312322.0, + "step": 14420 + }, + { + "epoch": 3.8996755002704164, + "grad_norm": 3.140625, + "learning_rate": 0.004191922696804902, + "loss": 2.812, + "mean_token_accuracy": 0.4438542425632477, + "num_tokens": 7371747780.0, + "step": 14421 + }, + { + "epoch": 3.899945916711736, + "grad_norm": 2.875, + "learning_rate": 0.0041908917016189465, + "loss": 2.8614, + "mean_token_accuracy": 0.43768054246902466, + "num_tokens": 7372272028.0, + "step": 14422 + }, + { + "epoch": 3.9002163331530557, + "grad_norm": 2.984375, + "learning_rate": 0.004189860915356456, + "loss": 2.9534, + "mean_token_accuracy": 0.42864441871643066, + "num_tokens": 7372796307.0, + "step": 14423 + }, + { + "epoch": 3.9004867495943754, + "grad_norm": 3.53125, + "learning_rate": 0.00418883033804906, + "loss": 2.8438, + "mean_token_accuracy": 0.434850811958313, + "num_tokens": 7373320491.0, + "step": 14424 + }, + { + "epoch": 3.900757166035695, + "grad_norm": 3.046875, + "learning_rate": 0.004187799969728381, + "loss": 2.9881, + "mean_token_accuracy": 0.4216166138648987, + "num_tokens": 7373844656.0, + "step": 14425 + }, + { + "epoch": 3.9010275824770146, + "grad_norm": 3.515625, + "learning_rate": 0.004186769810426034, + "loss": 2.9408, + "mean_token_accuracy": 0.4271412789821625, + "num_tokens": 7374368859.0, + "step": 14426 + }, + { + "epoch": 3.9012979989183343, + "grad_norm": 3.046875, + "learning_rate": 0.004185739860173622, + "loss": 3.0096, + "mean_token_accuracy": 0.41861432790756226, + "num_tokens": 7374893074.0, + "step": 14427 + }, + { + "epoch": 3.901568415359654, + "grad_norm": 3.671875, + "learning_rate": 0.004184710119002753, + "loss": 2.9563, + "mean_token_accuracy": 0.42840611934661865, + "num_tokens": 7375417232.0, + "step": 14428 + }, + { + "epoch": 3.9018388318009736, + "grad_norm": 2.828125, + "learning_rate": 0.004183680586945022, + "loss": 2.8955, + "mean_token_accuracy": 0.4225202798843384, + "num_tokens": 7375907015.0, + "step": 14429 + }, + { + "epoch": 3.902109248242293, + "grad_norm": 3.296875, + "learning_rate": 0.004182651264032011, + "loss": 2.6514, + "mean_token_accuracy": 0.437109112739563, + "num_tokens": 7376382545.0, + "step": 14430 + }, + { + "epoch": 3.902379664683613, + "grad_norm": 168.0, + "learning_rate": 0.004181622150295311, + "loss": 9.8077, + "mean_token_accuracy": 0.009117785841226578, + "num_tokens": 7376906562.0, + "step": 14431 + }, + { + "epoch": 3.9026500811249325, + "grad_norm": 6.53125, + "learning_rate": 0.0041805932457664955, + "loss": 2.9301, + "mean_token_accuracy": 0.4361448287963867, + "num_tokens": 7377430806.0, + "step": 14432 + }, + { + "epoch": 3.902920497566252, + "grad_norm": 2.734375, + "learning_rate": 0.004179564550477131, + "loss": 2.7452, + "mean_token_accuracy": 0.43299418687820435, + "num_tokens": 7377955051.0, + "step": 14433 + }, + { + "epoch": 3.903190914007572, + "grad_norm": 2.796875, + "learning_rate": 0.004178536064458786, + "loss": 2.989, + "mean_token_accuracy": 0.41856831312179565, + "num_tokens": 7378479194.0, + "step": 14434 + }, + { + "epoch": 3.9034613304488914, + "grad_norm": 3.53125, + "learning_rate": 0.004177507787743013, + "loss": 2.9125, + "mean_token_accuracy": 0.41700318455696106, + "num_tokens": 7379003370.0, + "step": 14435 + }, + { + "epoch": 3.903731746890211, + "grad_norm": 3.515625, + "learning_rate": 0.004176479720361367, + "loss": 2.911, + "mean_token_accuracy": 0.4251757264137268, + "num_tokens": 7379527517.0, + "step": 14436 + }, + { + "epoch": 3.9040021633315307, + "grad_norm": 3.8125, + "learning_rate": 0.004175451862345387, + "loss": 2.9683, + "mean_token_accuracy": 0.42637524008750916, + "num_tokens": 7380007935.0, + "step": 14437 + }, + { + "epoch": 3.90427257977285, + "grad_norm": 3.296875, + "learning_rate": 0.004174424213726617, + "loss": 2.7624, + "mean_token_accuracy": 0.4548013508319855, + "num_tokens": 7380499412.0, + "step": 14438 + }, + { + "epoch": 3.90454299621417, + "grad_norm": 3.3125, + "learning_rate": 0.004173396774536581, + "loss": 2.9994, + "mean_token_accuracy": 0.4168661832809448, + "num_tokens": 7381023574.0, + "step": 14439 + }, + { + "epoch": 3.904813412655489, + "grad_norm": 3.15625, + "learning_rate": 0.004172369544806811, + "loss": 2.7999, + "mean_token_accuracy": 0.44551581144332886, + "num_tokens": 7381547768.0, + "step": 14440 + }, + { + "epoch": 3.9050838290968093, + "grad_norm": 3.609375, + "learning_rate": 0.0041713425245688196, + "loss": 2.8909, + "mean_token_accuracy": 0.4244677722454071, + "num_tokens": 7382071945.0, + "step": 14441 + }, + { + "epoch": 3.9053542455381285, + "grad_norm": 2.984375, + "learning_rate": 0.004170315713854123, + "loss": 2.9344, + "mean_token_accuracy": 0.43555688858032227, + "num_tokens": 7382553553.0, + "step": 14442 + }, + { + "epoch": 3.9056246619794486, + "grad_norm": 4.1875, + "learning_rate": 0.004169289112694226, + "loss": 2.7246, + "mean_token_accuracy": 0.4618489742279053, + "num_tokens": 7383077690.0, + "step": 14443 + }, + { + "epoch": 3.9058950784207678, + "grad_norm": 2.09375, + "learning_rate": 0.004168262721120625, + "loss": 2.9622, + "mean_token_accuracy": 0.42658695578575134, + "num_tokens": 7383601976.0, + "step": 14444 + }, + { + "epoch": 3.906165494862088, + "grad_norm": 3.0625, + "learning_rate": 0.004167236539164816, + "loss": 2.8856, + "mean_token_accuracy": 0.41843530535697937, + "num_tokens": 7384126114.0, + "step": 14445 + }, + { + "epoch": 3.906435911303407, + "grad_norm": 2.9375, + "learning_rate": 0.004166210566858284, + "loss": 2.7618, + "mean_token_accuracy": 0.44435253739356995, + "num_tokens": 7384650196.0, + "step": 14446 + }, + { + "epoch": 3.906706327744727, + "grad_norm": 3.40625, + "learning_rate": 0.004165184804232506, + "loss": 2.7559, + "mean_token_accuracy": 0.42912042140960693, + "num_tokens": 7385174355.0, + "step": 14447 + }, + { + "epoch": 3.9069767441860463, + "grad_norm": 2.53125, + "learning_rate": 0.004164159251318961, + "loss": 2.841, + "mean_token_accuracy": 0.42987388372421265, + "num_tokens": 7385698515.0, + "step": 14448 + }, + { + "epoch": 3.907247160627366, + "grad_norm": 2.90625, + "learning_rate": 0.004163133908149113, + "loss": 2.6801, + "mean_token_accuracy": 0.4389611482620239, + "num_tokens": 7386222750.0, + "step": 14449 + }, + { + "epoch": 3.9075175770686856, + "grad_norm": 2.78125, + "learning_rate": 0.00416210877475442, + "loss": 2.6935, + "mean_token_accuracy": 0.4624074697494507, + "num_tokens": 7386722712.0, + "step": 14450 + }, + { + "epoch": 3.9077879935100053, + "grad_norm": 7.46875, + "learning_rate": 0.004161083851166339, + "loss": 9.8385, + "mean_token_accuracy": 0.02883523516356945, + "num_tokens": 7387246911.0, + "step": 14451 + }, + { + "epoch": 3.908058409951325, + "grad_norm": 6.53125, + "learning_rate": 0.004160059137416319, + "loss": 2.9336, + "mean_token_accuracy": 0.4160611927509308, + "num_tokens": 7387740838.0, + "step": 14452 + }, + { + "epoch": 3.9083288263926446, + "grad_norm": 2.125, + "learning_rate": 0.004159034633535797, + "loss": 2.8651, + "mean_token_accuracy": 0.43018639087677, + "num_tokens": 7388237080.0, + "step": 14453 + }, + { + "epoch": 3.908599242833964, + "grad_norm": 3.0, + "learning_rate": 0.004158010339556215, + "loss": 2.8004, + "mean_token_accuracy": 0.4453688859939575, + "num_tokens": 7388761313.0, + "step": 14454 + }, + { + "epoch": 3.908869659275284, + "grad_norm": 3.640625, + "learning_rate": 0.004156986255508996, + "loss": 3.0164, + "mean_token_accuracy": 0.4309620261192322, + "num_tokens": 7389285596.0, + "step": 14455 + }, + { + "epoch": 3.9091400757166035, + "grad_norm": 3.640625, + "learning_rate": 0.004155962381425559, + "loss": 2.9166, + "mean_token_accuracy": 0.424515962600708, + "num_tokens": 7389809879.0, + "step": 14456 + }, + { + "epoch": 3.909410492157923, + "grad_norm": 3.015625, + "learning_rate": 0.004154938717337326, + "loss": 2.9217, + "mean_token_accuracy": 0.40058642625808716, + "num_tokens": 7390292275.0, + "step": 14457 + }, + { + "epoch": 3.9096809085992428, + "grad_norm": 3.03125, + "learning_rate": 0.0041539152632757, + "loss": 2.8503, + "mean_token_accuracy": 0.43801504373550415, + "num_tokens": 7390794873.0, + "step": 14458 + }, + { + "epoch": 3.9099513250405624, + "grad_norm": 3.046875, + "learning_rate": 0.004152892019272091, + "loss": 2.7933, + "mean_token_accuracy": 0.4469650685787201, + "num_tokens": 7391319081.0, + "step": 14459 + }, + { + "epoch": 3.910221741481882, + "grad_norm": 3.421875, + "learning_rate": 0.00415186898535789, + "loss": 2.8616, + "mean_token_accuracy": 0.4358375072479248, + "num_tokens": 7391822199.0, + "step": 14460 + }, + { + "epoch": 3.9104921579232017, + "grad_norm": 2.890625, + "learning_rate": 0.004150846161564485, + "loss": 2.8765, + "mean_token_accuracy": 0.436746209859848, + "num_tokens": 7392346396.0, + "step": 14461 + }, + { + "epoch": 3.9107625743645213, + "grad_norm": 3.671875, + "learning_rate": 0.0041498235479232645, + "loss": 2.634, + "mean_token_accuracy": 0.43514883518218994, + "num_tokens": 7392826058.0, + "step": 14462 + }, + { + "epoch": 3.911032990805841, + "grad_norm": 2.859375, + "learning_rate": 0.004148801144465602, + "loss": 2.8495, + "mean_token_accuracy": 0.4157467782497406, + "num_tokens": 7393350317.0, + "step": 14463 + }, + { + "epoch": 3.9113034072471606, + "grad_norm": 3.703125, + "learning_rate": 0.004147778951222867, + "loss": 2.741, + "mean_token_accuracy": 0.4425390958786011, + "num_tokens": 7393837992.0, + "step": 14464 + }, + { + "epoch": 3.9115738236884803, + "grad_norm": 3.65625, + "learning_rate": 0.0041467569682264276, + "loss": 2.7675, + "mean_token_accuracy": 0.40404680371284485, + "num_tokens": 7394362252.0, + "step": 14465 + }, + { + "epoch": 3.9118442401298, + "grad_norm": 3.484375, + "learning_rate": 0.004145735195507635, + "loss": 2.8069, + "mean_token_accuracy": 0.4335874915122986, + "num_tokens": 7394876250.0, + "step": 14466 + }, + { + "epoch": 3.9121146565711196, + "grad_norm": 3.8125, + "learning_rate": 0.004144713633097844, + "loss": 2.9821, + "mean_token_accuracy": 0.4735887944698334, + "num_tokens": 7395276794.0, + "step": 14467 + }, + { + "epoch": 3.912385073012439, + "grad_norm": 3.515625, + "learning_rate": 0.0041436922810284, + "loss": 2.9439, + "mean_token_accuracy": 0.43426138162612915, + "num_tokens": 7395749716.0, + "step": 14468 + }, + { + "epoch": 3.912655489453759, + "grad_norm": 3.34375, + "learning_rate": 0.004142671139330641, + "loss": 2.8469, + "mean_token_accuracy": 0.4444758892059326, + "num_tokens": 7396273973.0, + "step": 14469 + }, + { + "epoch": 3.9129259058950785, + "grad_norm": 3.5625, + "learning_rate": 0.004141650208035895, + "loss": 2.865, + "mean_token_accuracy": 0.42348697781562805, + "num_tokens": 7396798180.0, + "step": 14470 + }, + { + "epoch": 3.913196322336398, + "grad_norm": 50.5, + "learning_rate": 0.004140629487175493, + "loss": 16.4045, + "mean_token_accuracy": 0.0006688497378490865, + "num_tokens": 7397322347.0, + "step": 14471 + }, + { + "epoch": 3.913466738777718, + "grad_norm": 6.59375, + "learning_rate": 0.0041396089767807495, + "loss": 2.9393, + "mean_token_accuracy": 0.41416293382644653, + "num_tokens": 7397846623.0, + "step": 14472 + }, + { + "epoch": 3.9137371552190374, + "grad_norm": 2.140625, + "learning_rate": 0.004138588676882976, + "loss": 2.9166, + "mean_token_accuracy": 0.42115435004234314, + "num_tokens": 7398370872.0, + "step": 14473 + }, + { + "epoch": 3.914007571660357, + "grad_norm": 3.203125, + "learning_rate": 0.004137568587513482, + "loss": 2.678, + "mean_token_accuracy": 0.43677759170532227, + "num_tokens": 7398895054.0, + "step": 14474 + }, + { + "epoch": 3.9142779881016767, + "grad_norm": 2.921875, + "learning_rate": 0.004136548708703565, + "loss": 2.7745, + "mean_token_accuracy": 0.442595899105072, + "num_tokens": 7399419299.0, + "step": 14475 + }, + { + "epoch": 3.9145484045429964, + "grad_norm": 3.234375, + "learning_rate": 0.004135529040484515, + "loss": 3.0303, + "mean_token_accuracy": 0.4212965965270996, + "num_tokens": 7399909071.0, + "step": 14476 + }, + { + "epoch": 3.914818820984316, + "grad_norm": 3.21875, + "learning_rate": 0.004134509582887623, + "loss": 2.8527, + "mean_token_accuracy": 0.4381483197212219, + "num_tokens": 7400433338.0, + "step": 14477 + }, + { + "epoch": 3.9150892374256356, + "grad_norm": 3.5, + "learning_rate": 0.0041334903359441675, + "loss": 2.7824, + "mean_token_accuracy": 0.44239699840545654, + "num_tokens": 7400957530.0, + "step": 14478 + }, + { + "epoch": 3.915359653866955, + "grad_norm": 3.390625, + "learning_rate": 0.004132471299685419, + "loss": 3.008, + "mean_token_accuracy": 0.423197865486145, + "num_tokens": 7401479932.0, + "step": 14479 + }, + { + "epoch": 3.915630070308275, + "grad_norm": 3.5, + "learning_rate": 0.004131452474142648, + "loss": 2.9593, + "mean_token_accuracy": 0.4413505494594574, + "num_tokens": 7401943937.0, + "step": 14480 + }, + { + "epoch": 3.915900486749594, + "grad_norm": 3.4375, + "learning_rate": 0.004130433859347115, + "loss": 2.9035, + "mean_token_accuracy": 0.44396138191223145, + "num_tokens": 7402409349.0, + "step": 14481 + }, + { + "epoch": 3.916170903190914, + "grad_norm": 3.46875, + "learning_rate": 0.004129415455330073, + "loss": 2.9699, + "mean_token_accuracy": 0.4189565181732178, + "num_tokens": 7402933587.0, + "step": 14482 + }, + { + "epoch": 3.9164413196322334, + "grad_norm": 3.140625, + "learning_rate": 0.004128397262122771, + "loss": 2.7398, + "mean_token_accuracy": 0.43993932008743286, + "num_tokens": 7403457801.0, + "step": 14483 + }, + { + "epoch": 3.9167117360735535, + "grad_norm": 3.328125, + "learning_rate": 0.004127379279756447, + "loss": 2.7619, + "mean_token_accuracy": 0.4313555955886841, + "num_tokens": 7403982083.0, + "step": 14484 + }, + { + "epoch": 3.9169821525148727, + "grad_norm": 3.4375, + "learning_rate": 0.00412636150826234, + "loss": 2.8748, + "mean_token_accuracy": 0.4281260371208191, + "num_tokens": 7404448477.0, + "step": 14485 + }, + { + "epoch": 3.917252568956193, + "grad_norm": 3.71875, + "learning_rate": 0.0041253439476716764, + "loss": 2.6721, + "mean_token_accuracy": 0.4605129063129425, + "num_tokens": 7404937246.0, + "step": 14486 + }, + { + "epoch": 3.917522985397512, + "grad_norm": 2.453125, + "learning_rate": 0.0041243265980156745, + "loss": 2.8343, + "mean_token_accuracy": 0.42474132776260376, + "num_tokens": 7405461520.0, + "step": 14487 + }, + { + "epoch": 3.917793401838832, + "grad_norm": 3.328125, + "learning_rate": 0.004123309459325555, + "loss": 2.7702, + "mean_token_accuracy": 0.4362671375274658, + "num_tokens": 7405985593.0, + "step": 14488 + }, + { + "epoch": 3.9180638182801513, + "grad_norm": 3.078125, + "learning_rate": 0.004122292531632524, + "loss": 2.9871, + "mean_token_accuracy": 0.4186939001083374, + "num_tokens": 7406509798.0, + "step": 14489 + }, + { + "epoch": 3.918334234721471, + "grad_norm": 3.609375, + "learning_rate": 0.004121275814967783, + "loss": 2.7085, + "mean_token_accuracy": 0.44134950637817383, + "num_tokens": 7407033952.0, + "step": 14490 + }, + { + "epoch": 3.9186046511627906, + "grad_norm": 4.40625, + "learning_rate": 0.0041202593093625315, + "loss": 8.8543, + "mean_token_accuracy": 0.010179148986935616, + "num_tokens": 7407558159.0, + "step": 14491 + }, + { + "epoch": 3.91887506760411, + "grad_norm": 5.625, + "learning_rate": 0.004119243014847955, + "loss": 2.9393, + "mean_token_accuracy": 0.41711220145225525, + "num_tokens": 7408082383.0, + "step": 14492 + }, + { + "epoch": 3.91914548404543, + "grad_norm": 2.25, + "learning_rate": 0.004118226931455236, + "loss": 2.889, + "mean_token_accuracy": 0.4370530843734741, + "num_tokens": 7408606654.0, + "step": 14493 + }, + { + "epoch": 3.9194159004867495, + "grad_norm": 2.984375, + "learning_rate": 0.004117211059215553, + "loss": 2.9019, + "mean_token_accuracy": 0.43566828966140747, + "num_tokens": 7409130879.0, + "step": 14494 + }, + { + "epoch": 3.919686316928069, + "grad_norm": 2.890625, + "learning_rate": 0.004116195398160077, + "loss": 2.8107, + "mean_token_accuracy": 0.41807565093040466, + "num_tokens": 7409655110.0, + "step": 14495 + }, + { + "epoch": 3.9199567333693888, + "grad_norm": 2.984375, + "learning_rate": 0.0041151799483199695, + "loss": 2.9213, + "mean_token_accuracy": 0.4242687225341797, + "num_tokens": 7410179319.0, + "step": 14496 + }, + { + "epoch": 3.9202271498107084, + "grad_norm": 3.46875, + "learning_rate": 0.00411416470972639, + "loss": 2.7612, + "mean_token_accuracy": 0.44427746534347534, + "num_tokens": 7410703553.0, + "step": 14497 + }, + { + "epoch": 3.920497566252028, + "grad_norm": 2.921875, + "learning_rate": 0.004113149682410487, + "loss": 2.9416, + "mean_token_accuracy": 0.4071078598499298, + "num_tokens": 7411227766.0, + "step": 14498 + }, + { + "epoch": 3.9207679826933477, + "grad_norm": 3.578125, + "learning_rate": 0.004112134866403401, + "loss": 2.8068, + "mean_token_accuracy": 0.43674737215042114, + "num_tokens": 7411752023.0, + "step": 14499 + }, + { + "epoch": 3.9210383991346673, + "grad_norm": 3.015625, + "learning_rate": 0.004111120261736276, + "loss": 2.939, + "mean_token_accuracy": 0.44539913535118103, + "num_tokens": 7412249959.0, + "step": 14500 + }, + { + "epoch": 3.921308815575987, + "grad_norm": 3.71875, + "learning_rate": 0.00411010586844024, + "loss": 2.7002, + "mean_token_accuracy": 0.43998977541923523, + "num_tokens": 7412774205.0, + "step": 14501 + }, + { + "epoch": 3.9215792320173066, + "grad_norm": 3.40625, + "learning_rate": 0.0041090916865464166, + "loss": 2.8359, + "mean_token_accuracy": 0.4274541437625885, + "num_tokens": 7413298359.0, + "step": 14502 + }, + { + "epoch": 3.9218496484586263, + "grad_norm": 3.8125, + "learning_rate": 0.0041080777160859274, + "loss": 2.9717, + "mean_token_accuracy": 0.424823135137558, + "num_tokens": 7413822626.0, + "step": 14503 + }, + { + "epoch": 3.922120064899946, + "grad_norm": 3.21875, + "learning_rate": 0.0041070639570898815, + "loss": 2.7561, + "mean_token_accuracy": 0.4254448413848877, + "num_tokens": 7414346833.0, + "step": 14504 + }, + { + "epoch": 3.9223904813412656, + "grad_norm": 3.21875, + "learning_rate": 0.0041060504095893815, + "loss": 2.8534, + "mean_token_accuracy": 0.4248133599758148, + "num_tokens": 7414871034.0, + "step": 14505 + }, + { + "epoch": 3.922660897782585, + "grad_norm": 5.375, + "learning_rate": 0.004105037073615532, + "loss": 2.597, + "mean_token_accuracy": 0.47603997588157654, + "num_tokens": 7415395282.0, + "step": 14506 + }, + { + "epoch": 3.922931314223905, + "grad_norm": 2.578125, + "learning_rate": 0.004104023949199419, + "loss": 2.9133, + "mean_token_accuracy": 0.4302951991558075, + "num_tokens": 7415919460.0, + "step": 14507 + }, + { + "epoch": 3.9232017306652245, + "grad_norm": 4.03125, + "learning_rate": 0.004103011036372133, + "loss": 2.9594, + "mean_token_accuracy": 0.42188766598701477, + "num_tokens": 7416443581.0, + "step": 14508 + }, + { + "epoch": 3.923472147106544, + "grad_norm": 2.828125, + "learning_rate": 0.004101998335164748, + "loss": 2.8021, + "mean_token_accuracy": 0.42911797761917114, + "num_tokens": 7416967837.0, + "step": 14509 + }, + { + "epoch": 3.9237425635478638, + "grad_norm": 3.640625, + "learning_rate": 0.00410098584560834, + "loss": 3.0252, + "mean_token_accuracy": 0.4234534502029419, + "num_tokens": 7417492116.0, + "step": 14510 + }, + { + "epoch": 3.9240129799891834, + "grad_norm": 6.9375, + "learning_rate": 0.0040999735677339786, + "loss": 9.9149, + "mean_token_accuracy": 0.017274999991059303, + "num_tokens": 7418016387.0, + "step": 14511 + }, + { + "epoch": 3.924283396430503, + "grad_norm": 8.1875, + "learning_rate": 0.00409896150157272, + "loss": 3.164, + "mean_token_accuracy": 0.4124196171760559, + "num_tokens": 7418540599.0, + "step": 14512 + }, + { + "epoch": 3.9245538128718227, + "grad_norm": 2.328125, + "learning_rate": 0.004097949647155613, + "loss": 2.9068, + "mean_token_accuracy": 0.4324183762073517, + "num_tokens": 7419064878.0, + "step": 14513 + }, + { + "epoch": 3.9248242293131423, + "grad_norm": 28.25, + "learning_rate": 0.004096938004513713, + "loss": 2.9315, + "mean_token_accuracy": 0.42560386657714844, + "num_tokens": 7419589058.0, + "step": 14514 + }, + { + "epoch": 3.925094645754462, + "grad_norm": 6.40625, + "learning_rate": 0.004095926573678054, + "loss": 2.9459, + "mean_token_accuracy": 0.4180050492286682, + "num_tokens": 7420113316.0, + "step": 14515 + }, + { + "epoch": 3.9253650621957816, + "grad_norm": 2.59375, + "learning_rate": 0.0040949153546796695, + "loss": 2.8544, + "mean_token_accuracy": 0.42467138171195984, + "num_tokens": 7420637593.0, + "step": 14516 + }, + { + "epoch": 3.9256354786371013, + "grad_norm": 2.78125, + "learning_rate": 0.0040939043475495904, + "loss": 2.8849, + "mean_token_accuracy": 0.41937267780303955, + "num_tokens": 7421161833.0, + "step": 14517 + }, + { + "epoch": 3.925905895078421, + "grad_norm": 2.703125, + "learning_rate": 0.004092893552318835, + "loss": 3.0327, + "mean_token_accuracy": 0.4119373857975006, + "num_tokens": 7421686115.0, + "step": 14518 + }, + { + "epoch": 3.9261763115197406, + "grad_norm": 2.8125, + "learning_rate": 0.0040918829690184165, + "loss": 2.7836, + "mean_token_accuracy": 0.4415318965911865, + "num_tokens": 7422210274.0, + "step": 14519 + }, + { + "epoch": 3.9264467279610598, + "grad_norm": 3.203125, + "learning_rate": 0.004090872597679346, + "loss": 2.7413, + "mean_token_accuracy": 0.4222351312637329, + "num_tokens": 7422696296.0, + "step": 14520 + }, + { + "epoch": 3.92671714440238, + "grad_norm": 2.46875, + "learning_rate": 0.00408986243833262, + "loss": 2.7328, + "mean_token_accuracy": 0.45885539054870605, + "num_tokens": 7423182095.0, + "step": 14521 + }, + { + "epoch": 3.926987560843699, + "grad_norm": 3.6875, + "learning_rate": 0.004088852491009238, + "loss": 2.8665, + "mean_token_accuracy": 0.42557018995285034, + "num_tokens": 7423706375.0, + "step": 14522 + }, + { + "epoch": 3.927257977285019, + "grad_norm": 4.0, + "learning_rate": 0.004087842755740184, + "loss": 2.6843, + "mean_token_accuracy": 0.44950366020202637, + "num_tokens": 7424191033.0, + "step": 14523 + }, + { + "epoch": 3.9275283937263383, + "grad_norm": 3.5625, + "learning_rate": 0.0040868332325564435, + "loss": 2.9306, + "mean_token_accuracy": 0.42746812105178833, + "num_tokens": 7424715255.0, + "step": 14524 + }, + { + "epoch": 3.9277988101676584, + "grad_norm": 4.15625, + "learning_rate": 0.004085823921488988, + "loss": 2.9339, + "mean_token_accuracy": 0.4409122169017792, + "num_tokens": 7425239525.0, + "step": 14525 + }, + { + "epoch": 3.9280692266089776, + "grad_norm": 3.9375, + "learning_rate": 0.004084814822568791, + "loss": 2.9849, + "mean_token_accuracy": 0.412855863571167, + "num_tokens": 7425763804.0, + "step": 14526 + }, + { + "epoch": 3.9283396430502977, + "grad_norm": 3.359375, + "learning_rate": 0.0040838059358268095, + "loss": 2.9896, + "mean_token_accuracy": 0.42174220085144043, + "num_tokens": 7426288063.0, + "step": 14527 + }, + { + "epoch": 3.928610059491617, + "grad_norm": 3.65625, + "learning_rate": 0.004082797261294, + "loss": 2.8979, + "mean_token_accuracy": 0.4292142391204834, + "num_tokens": 7426812342.0, + "step": 14528 + }, + { + "epoch": 3.928880475932937, + "grad_norm": 3.203125, + "learning_rate": 0.004081788799001315, + "loss": 2.8618, + "mean_token_accuracy": 0.4444325864315033, + "num_tokens": 7427323945.0, + "step": 14529 + }, + { + "epoch": 3.929150892374256, + "grad_norm": 3.484375, + "learning_rate": 0.0040807805489796916, + "loss": 2.8902, + "mean_token_accuracy": 0.4348742663860321, + "num_tokens": 7427848217.0, + "step": 14530 + }, + { + "epoch": 3.929421308815576, + "grad_norm": 10.0, + "learning_rate": 0.004079772511260072, + "loss": 10.1403, + "mean_token_accuracy": 0.000183373354957439, + "num_tokens": 7428372341.0, + "step": 14531 + }, + { + "epoch": 3.9296917252568955, + "grad_norm": 6.4375, + "learning_rate": 0.004078764685873383, + "loss": 2.9827, + "mean_token_accuracy": 0.42386502027511597, + "num_tokens": 7428896502.0, + "step": 14532 + }, + { + "epoch": 3.929962141698215, + "grad_norm": 2.40625, + "learning_rate": 0.0040777570728505455, + "loss": 2.6671, + "mean_token_accuracy": 0.4389221668243408, + "num_tokens": 7429393798.0, + "step": 14533 + }, + { + "epoch": 3.9302325581395348, + "grad_norm": 6.6875, + "learning_rate": 0.004076749672222479, + "loss": 2.6963, + "mean_token_accuracy": 0.4510369896888733, + "num_tokens": 7429899312.0, + "step": 14534 + }, + { + "epoch": 3.9305029745808544, + "grad_norm": 3.0625, + "learning_rate": 0.004075742484020093, + "loss": 2.7015, + "mean_token_accuracy": 0.4215744733810425, + "num_tokens": 7430423540.0, + "step": 14535 + }, + { + "epoch": 3.930773391022174, + "grad_norm": 2.609375, + "learning_rate": 0.0040747355082742884, + "loss": 2.9954, + "mean_token_accuracy": 0.4054655432701111, + "num_tokens": 7430947649.0, + "step": 14536 + }, + { + "epoch": 3.9310438074634937, + "grad_norm": 3.0, + "learning_rate": 0.004073728745015965, + "loss": 2.9935, + "mean_token_accuracy": 0.44289135932922363, + "num_tokens": 7431415019.0, + "step": 14537 + }, + { + "epoch": 3.9313142239048133, + "grad_norm": 3.25, + "learning_rate": 0.004072722194276014, + "loss": 2.9135, + "mean_token_accuracy": 0.43710795044898987, + "num_tokens": 7431939128.0, + "step": 14538 + }, + { + "epoch": 3.931584640346133, + "grad_norm": 3.34375, + "learning_rate": 0.0040717158560853155, + "loss": 2.8953, + "mean_token_accuracy": 0.42107075452804565, + "num_tokens": 7432463244.0, + "step": 14539 + }, + { + "epoch": 3.9318550567874526, + "grad_norm": 3.15625, + "learning_rate": 0.004070709730474752, + "loss": 2.8749, + "mean_token_accuracy": 0.44039541482925415, + "num_tokens": 7432940077.0, + "step": 14540 + }, + { + "epoch": 3.9321254732287723, + "grad_norm": 4.03125, + "learning_rate": 0.004069703817475191, + "loss": 2.9181, + "mean_token_accuracy": 0.4108377695083618, + "num_tokens": 7433426149.0, + "step": 14541 + }, + { + "epoch": 3.932395889670092, + "grad_norm": 3.625, + "learning_rate": 0.004068698117117495, + "loss": 2.8593, + "mean_token_accuracy": 0.4405953884124756, + "num_tokens": 7433939044.0, + "step": 14542 + }, + { + "epoch": 3.9326663061114115, + "grad_norm": 3.265625, + "learning_rate": 0.004067692629432529, + "loss": 2.857, + "mean_token_accuracy": 0.46511855721473694, + "num_tokens": 7434398384.0, + "step": 14543 + }, + { + "epoch": 3.932936722552731, + "grad_norm": 2.984375, + "learning_rate": 0.004066687354451137, + "loss": 2.9228, + "mean_token_accuracy": 0.4135879874229431, + "num_tokens": 7434922524.0, + "step": 14544 + }, + { + "epoch": 3.933207138994051, + "grad_norm": 6.6875, + "learning_rate": 0.0040656822922041655, + "loss": 2.695, + "mean_token_accuracy": 0.46086132526397705, + "num_tokens": 7435446773.0, + "step": 14545 + }, + { + "epoch": 3.9334775554353705, + "grad_norm": 5.84375, + "learning_rate": 0.004064677442722457, + "loss": 2.5812, + "mean_token_accuracy": 0.49106699228286743, + "num_tokens": 7435971024.0, + "step": 14546 + }, + { + "epoch": 3.93374797187669, + "grad_norm": 2.015625, + "learning_rate": 0.004063672806036839, + "loss": 2.9373, + "mean_token_accuracy": 0.4246155917644501, + "num_tokens": 7436495266.0, + "step": 14547 + }, + { + "epoch": 3.9340183883180098, + "grad_norm": 3.40625, + "learning_rate": 0.004062668382178135, + "loss": 2.6431, + "mean_token_accuracy": 0.4638572335243225, + "num_tokens": 7437019228.0, + "step": 14548 + }, + { + "epoch": 3.9342888047593294, + "grad_norm": 3.46875, + "learning_rate": 0.004061664171177169, + "loss": 2.7661, + "mean_token_accuracy": 0.4398046135902405, + "num_tokens": 7437531021.0, + "step": 14549 + }, + { + "epoch": 3.934559221200649, + "grad_norm": 3.734375, + "learning_rate": 0.004060660173064749, + "loss": 2.7492, + "mean_token_accuracy": 0.43982553482055664, + "num_tokens": 7438055156.0, + "step": 14550 + }, + { + "epoch": 3.9348296376419687, + "grad_norm": 300.0, + "learning_rate": 0.004059656387871679, + "loss": 10.9336, + "mean_token_accuracy": 0.0049040173180401325, + "num_tokens": 7438579389.0, + "step": 14551 + }, + { + "epoch": 3.9351000540832883, + "grad_norm": 5.78125, + "learning_rate": 0.004058652815628768, + "loss": 2.9357, + "mean_token_accuracy": 0.4357559084892273, + "num_tokens": 7439103628.0, + "step": 14552 + }, + { + "epoch": 3.935370470524608, + "grad_norm": 2.53125, + "learning_rate": 0.004057649456366797, + "loss": 2.9564, + "mean_token_accuracy": 0.42877012491226196, + "num_tokens": 7439602880.0, + "step": 14553 + }, + { + "epoch": 3.9356408869659276, + "grad_norm": 2.46875, + "learning_rate": 0.00405664631011656, + "loss": 2.8606, + "mean_token_accuracy": 0.4462394118309021, + "num_tokens": 7440073015.0, + "step": 14554 + }, + { + "epoch": 3.9359113034072473, + "grad_norm": 3.171875, + "learning_rate": 0.0040556433769088325, + "loss": 2.8521, + "mean_token_accuracy": 0.43134158849716187, + "num_tokens": 7440597222.0, + "step": 14555 + }, + { + "epoch": 3.936181719848567, + "grad_norm": 3.40625, + "learning_rate": 0.004054640656774387, + "loss": 2.9264, + "mean_token_accuracy": 0.4193502962589264, + "num_tokens": 7441096468.0, + "step": 14556 + }, + { + "epoch": 3.9364521362898865, + "grad_norm": 3.453125, + "learning_rate": 0.004053638149743992, + "loss": 2.6995, + "mean_token_accuracy": 0.4483298063278198, + "num_tokens": 7441620646.0, + "step": 14557 + }, + { + "epoch": 3.936722552731206, + "grad_norm": 3.09375, + "learning_rate": 0.0040526358558484075, + "loss": 2.7172, + "mean_token_accuracy": 0.4319509267807007, + "num_tokens": 7442144829.0, + "step": 14558 + }, + { + "epoch": 3.936992969172526, + "grad_norm": 3.171875, + "learning_rate": 0.004051633775118383, + "loss": 2.9603, + "mean_token_accuracy": 0.43794307112693787, + "num_tokens": 7442626744.0, + "step": 14559 + }, + { + "epoch": 3.9372633856138455, + "grad_norm": 2.890625, + "learning_rate": 0.0040506319075846705, + "loss": 2.6927, + "mean_token_accuracy": 0.4419079124927521, + "num_tokens": 7443150742.0, + "step": 14560 + }, + { + "epoch": 3.9375338020551647, + "grad_norm": 2.734375, + "learning_rate": 0.004049630253278008, + "loss": 2.9078, + "mean_token_accuracy": 0.4246842861175537, + "num_tokens": 7443674944.0, + "step": 14561 + }, + { + "epoch": 3.9378042184964848, + "grad_norm": 4.1875, + "learning_rate": 0.004048628812229126, + "loss": 2.8959, + "mean_token_accuracy": 0.44685035943984985, + "num_tokens": 7444153190.0, + "step": 14562 + }, + { + "epoch": 3.938074634937804, + "grad_norm": 2.921875, + "learning_rate": 0.004047627584468758, + "loss": 2.7977, + "mean_token_accuracy": 0.447030246257782, + "num_tokens": 7444637531.0, + "step": 14563 + }, + { + "epoch": 3.938345051379124, + "grad_norm": 2.796875, + "learning_rate": 0.004046626570027618, + "loss": 2.9671, + "mean_token_accuracy": 0.40874916315078735, + "num_tokens": 7445161806.0, + "step": 14564 + }, + { + "epoch": 3.9386154678204432, + "grad_norm": 5.5625, + "learning_rate": 0.004045625768936428, + "loss": 2.6211, + "mean_token_accuracy": 0.4795943796634674, + "num_tokens": 7445685908.0, + "step": 14565 + }, + { + "epoch": 3.9388858842617633, + "grad_norm": 2.671875, + "learning_rate": 0.004044625181225886, + "loss": 2.9388, + "mean_token_accuracy": 0.42268484830856323, + "num_tokens": 7446210153.0, + "step": 14566 + }, + { + "epoch": 3.9391563007030825, + "grad_norm": 3.21875, + "learning_rate": 0.004043624806926702, + "loss": 2.8924, + "mean_token_accuracy": 0.4343191385269165, + "num_tokens": 7446676293.0, + "step": 14567 + }, + { + "epoch": 3.9394267171444026, + "grad_norm": 3.140625, + "learning_rate": 0.004042624646069562, + "loss": 2.9607, + "mean_token_accuracy": 0.41170674562454224, + "num_tokens": 7447200523.0, + "step": 14568 + }, + { + "epoch": 3.939697133585722, + "grad_norm": 3.546875, + "learning_rate": 0.004041624698685162, + "loss": 2.978, + "mean_token_accuracy": 0.4357379078865051, + "num_tokens": 7447688966.0, + "step": 14569 + }, + { + "epoch": 3.939967550027042, + "grad_norm": 3.109375, + "learning_rate": 0.004040624964804179, + "loss": 2.9263, + "mean_token_accuracy": 0.4464464783668518, + "num_tokens": 7448156056.0, + "step": 14570 + }, + { + "epoch": 3.940237966468361, + "grad_norm": 45.25, + "learning_rate": 0.004039625444457286, + "loss": 9.0864, + "mean_token_accuracy": 0.026871267706155777, + "num_tokens": 7448680197.0, + "step": 14571 + }, + { + "epoch": 3.9405083829096808, + "grad_norm": 6.40625, + "learning_rate": 0.004038626137675157, + "loss": 2.8548, + "mean_token_accuracy": 0.4335136115550995, + "num_tokens": 7449204402.0, + "step": 14572 + }, + { + "epoch": 3.9407787993510004, + "grad_norm": 2.359375, + "learning_rate": 0.004037627044488448, + "loss": 2.9383, + "mean_token_accuracy": 0.4277905821800232, + "num_tokens": 7449724542.0, + "step": 14573 + }, + { + "epoch": 3.94104921579232, + "grad_norm": 2.78125, + "learning_rate": 0.004036628164927815, + "loss": 2.7545, + "mean_token_accuracy": 0.4297766089439392, + "num_tokens": 7450248811.0, + "step": 14574 + }, + { + "epoch": 3.9413196322336397, + "grad_norm": 3.578125, + "learning_rate": 0.00403562949902391, + "loss": 2.9795, + "mean_token_accuracy": 0.44129180908203125, + "num_tokens": 7450726307.0, + "step": 14575 + }, + { + "epoch": 3.9415900486749593, + "grad_norm": 2.640625, + "learning_rate": 0.004034631046807371, + "loss": 2.9632, + "mean_token_accuracy": 0.42334291338920593, + "num_tokens": 7451250574.0, + "step": 14576 + }, + { + "epoch": 3.941860465116279, + "grad_norm": 2.9375, + "learning_rate": 0.004033632808308837, + "loss": 2.6408, + "mean_token_accuracy": 0.4579678177833557, + "num_tokens": 7451774755.0, + "step": 14577 + }, + { + "epoch": 3.9421308815575986, + "grad_norm": 2.671875, + "learning_rate": 0.004032634783558935, + "loss": 2.7984, + "mean_token_accuracy": 0.44975146651268005, + "num_tokens": 7452283284.0, + "step": 14578 + }, + { + "epoch": 3.9424012979989183, + "grad_norm": 2.921875, + "learning_rate": 0.004031636972588286, + "loss": 2.6908, + "mean_token_accuracy": 0.44729581475257874, + "num_tokens": 7452807537.0, + "step": 14579 + }, + { + "epoch": 3.942671714440238, + "grad_norm": 3.03125, + "learning_rate": 0.004030639375427508, + "loss": 2.7289, + "mean_token_accuracy": 0.46052321791648865, + "num_tokens": 7453286894.0, + "step": 14580 + }, + { + "epoch": 3.9429421308815575, + "grad_norm": 3.421875, + "learning_rate": 0.004029641992107209, + "loss": 2.8349, + "mean_token_accuracy": 0.42880263924598694, + "num_tokens": 7453811127.0, + "step": 14581 + }, + { + "epoch": 3.943212547322877, + "grad_norm": 3.375, + "learning_rate": 0.004028644822657992, + "loss": 3.1087, + "mean_token_accuracy": 0.39383465051651, + "num_tokens": 7454335397.0, + "step": 14582 + }, + { + "epoch": 3.943482963764197, + "grad_norm": 2.890625, + "learning_rate": 0.004027647867110455, + "loss": 2.6669, + "mean_token_accuracy": 0.45800548791885376, + "num_tokens": 7454801380.0, + "step": 14583 + }, + { + "epoch": 3.9437533802055165, + "grad_norm": 2.8125, + "learning_rate": 0.004026651125495186, + "loss": 2.6766, + "mean_token_accuracy": 0.4606325030326843, + "num_tokens": 7455325608.0, + "step": 14584 + }, + { + "epoch": 3.944023796646836, + "grad_norm": 3.515625, + "learning_rate": 0.004025654597842766, + "loss": 2.8826, + "mean_token_accuracy": 0.38983777165412903, + "num_tokens": 7455849786.0, + "step": 14585 + }, + { + "epoch": 3.9442942130881558, + "grad_norm": 3.484375, + "learning_rate": 0.004024658284183775, + "loss": 2.8791, + "mean_token_accuracy": 0.4420683681964874, + "num_tokens": 7456352622.0, + "step": 14586 + }, + { + "epoch": 3.9445646295294754, + "grad_norm": 3.40625, + "learning_rate": 0.004023662184548781, + "loss": 2.8216, + "mean_token_accuracy": 0.4547693133354187, + "num_tokens": 7456824832.0, + "step": 14587 + }, + { + "epoch": 3.944835045970795, + "grad_norm": 2.953125, + "learning_rate": 0.004022666298968345, + "loss": 2.8343, + "mean_token_accuracy": 0.43601134419441223, + "num_tokens": 7457348970.0, + "step": 14588 + }, + { + "epoch": 3.9451054624121147, + "grad_norm": 3.40625, + "learning_rate": 0.004021670627473027, + "loss": 2.7765, + "mean_token_accuracy": 0.4275106191635132, + "num_tokens": 7457873204.0, + "step": 14589 + }, + { + "epoch": 3.9453758788534343, + "grad_norm": 3.109375, + "learning_rate": 0.004020675170093378, + "loss": 2.9251, + "mean_token_accuracy": 0.4261249303817749, + "num_tokens": 7458397341.0, + "step": 14590 + }, + { + "epoch": 3.945646295294754, + "grad_norm": 123.0, + "learning_rate": 0.004019679926859936, + "loss": 10.4876, + "mean_token_accuracy": 0.014932285994291306, + "num_tokens": 7458862345.0, + "step": 14591 + }, + { + "epoch": 3.9459167117360736, + "grad_norm": 7.5, + "learning_rate": 0.004018684897803244, + "loss": 3.1291, + "mean_token_accuracy": 0.4019260108470917, + "num_tokens": 7459324786.0, + "step": 14592 + }, + { + "epoch": 3.9461871281773933, + "grad_norm": 12.75, + "learning_rate": 0.004017690082953829, + "loss": 2.8489, + "mean_token_accuracy": 0.43750303983688354, + "num_tokens": 7459849045.0, + "step": 14593 + }, + { + "epoch": 3.946457544618713, + "grad_norm": 2.859375, + "learning_rate": 0.004016695482342215, + "loss": 2.7014, + "mean_token_accuracy": 0.44300562143325806, + "num_tokens": 7460373243.0, + "step": 14594 + }, + { + "epoch": 3.9467279610600325, + "grad_norm": 2.46875, + "learning_rate": 0.004015701095998922, + "loss": 2.7879, + "mean_token_accuracy": 0.4270745813846588, + "num_tokens": 7460897503.0, + "step": 14595 + }, + { + "epoch": 3.946998377501352, + "grad_norm": 2.703125, + "learning_rate": 0.0040147069239544605, + "loss": 2.9293, + "mean_token_accuracy": 0.4315199851989746, + "num_tokens": 7461397152.0, + "step": 14596 + }, + { + "epoch": 3.947268793942672, + "grad_norm": 3.9375, + "learning_rate": 0.004013712966239329, + "loss": 2.8856, + "mean_token_accuracy": 0.42154234647750854, + "num_tokens": 7461921426.0, + "step": 14597 + }, + { + "epoch": 3.9475392103839915, + "grad_norm": 2.78125, + "learning_rate": 0.004012719222884032, + "loss": 2.7968, + "mean_token_accuracy": 0.45220041275024414, + "num_tokens": 7462440272.0, + "step": 14598 + }, + { + "epoch": 3.947809626825311, + "grad_norm": 3.828125, + "learning_rate": 0.004011725693919055, + "loss": 2.832, + "mean_token_accuracy": 0.40675032138824463, + "num_tokens": 7462964478.0, + "step": 14599 + }, + { + "epoch": 3.9480800432666308, + "grad_norm": 3.3125, + "learning_rate": 0.004010732379374887, + "loss": 2.9184, + "mean_token_accuracy": 0.430599570274353, + "num_tokens": 7463475904.0, + "step": 14600 + }, + { + "epoch": 3.9483504597079504, + "grad_norm": 3.34375, + "learning_rate": 0.004009739279282005, + "loss": 2.9719, + "mean_token_accuracy": 0.42497703433036804, + "num_tokens": 7464000104.0, + "step": 14601 + }, + { + "epoch": 3.9486208761492696, + "grad_norm": 3.578125, + "learning_rate": 0.004008746393670875, + "loss": 2.9597, + "mean_token_accuracy": 0.42585980892181396, + "num_tokens": 7464524352.0, + "step": 14602 + }, + { + "epoch": 3.9488912925905897, + "grad_norm": 2.96875, + "learning_rate": 0.004007753722571969, + "loss": 2.9556, + "mean_token_accuracy": 0.4250715672969818, + "num_tokens": 7465048615.0, + "step": 14603 + }, + { + "epoch": 3.949161709031909, + "grad_norm": 3.359375, + "learning_rate": 0.00400676126601574, + "loss": 2.801, + "mean_token_accuracy": 0.4315730333328247, + "num_tokens": 7465572825.0, + "step": 14604 + }, + { + "epoch": 3.949432125473229, + "grad_norm": 3.359375, + "learning_rate": 0.0040057690240326415, + "loss": 2.7069, + "mean_token_accuracy": 0.4620886445045471, + "num_tokens": 7466097039.0, + "step": 14605 + }, + { + "epoch": 3.949702541914548, + "grad_norm": 4.75, + "learning_rate": 0.004004776996653118, + "loss": 2.8446, + "mean_token_accuracy": 0.4290159344673157, + "num_tokens": 7466586748.0, + "step": 14606 + }, + { + "epoch": 3.9499729583558683, + "grad_norm": 2.734375, + "learning_rate": 0.004003785183907606, + "loss": 3.0186, + "mean_token_accuracy": 0.4230247139930725, + "num_tokens": 7467097381.0, + "step": 14607 + }, + { + "epoch": 3.9502433747971875, + "grad_norm": 3.859375, + "learning_rate": 0.004002793585826542, + "loss": 2.8628, + "mean_token_accuracy": 0.435404509305954, + "num_tokens": 7467621490.0, + "step": 14608 + }, + { + "epoch": 3.9505137912385075, + "grad_norm": 3.09375, + "learning_rate": 0.004001802202440346, + "loss": 3.0583, + "mean_token_accuracy": 0.40640687942504883, + "num_tokens": 7468107614.0, + "step": 14609 + }, + { + "epoch": 3.9507842076798267, + "grad_norm": 3.515625, + "learning_rate": 0.00400081103377944, + "loss": 2.8217, + "mean_token_accuracy": 0.4117673635482788, + "num_tokens": 7468631770.0, + "step": 14610 + }, + { + "epoch": 3.951054624121147, + "grad_norm": 56.25, + "learning_rate": 0.003999820079874233, + "loss": 10.5328, + "mean_token_accuracy": 0.020099017769098282, + "num_tokens": 7469156013.0, + "step": 14611 + }, + { + "epoch": 3.951325040562466, + "grad_norm": 5.03125, + "learning_rate": 0.003998829340755136, + "loss": 2.9817, + "mean_token_accuracy": 0.41321343183517456, + "num_tokens": 7469680271.0, + "step": 14612 + }, + { + "epoch": 3.9515954570037857, + "grad_norm": 3.5625, + "learning_rate": 0.003997838816452542, + "loss": 2.8903, + "mean_token_accuracy": 0.4099959135055542, + "num_tokens": 7470204474.0, + "step": 14613 + }, + { + "epoch": 3.9518658734451053, + "grad_norm": 2.59375, + "learning_rate": 0.003996848506996843, + "loss": 2.6877, + "mean_token_accuracy": 0.4321996569633484, + "num_tokens": 7470694924.0, + "step": 14614 + }, + { + "epoch": 3.952136289886425, + "grad_norm": 2.796875, + "learning_rate": 0.003995858412418429, + "loss": 2.737, + "mean_token_accuracy": 0.451353520154953, + "num_tokens": 7471195892.0, + "step": 14615 + }, + { + "epoch": 3.9524067063277446, + "grad_norm": 3.171875, + "learning_rate": 0.003994868532747677, + "loss": 2.7454, + "mean_token_accuracy": 0.43844878673553467, + "num_tokens": 7471720085.0, + "step": 14616 + }, + { + "epoch": 3.9526771227690642, + "grad_norm": 2.65625, + "learning_rate": 0.003993878868014959, + "loss": 2.8256, + "mean_token_accuracy": 0.4311486482620239, + "num_tokens": 7472244230.0, + "step": 14617 + }, + { + "epoch": 3.952947539210384, + "grad_norm": 3.140625, + "learning_rate": 0.003992889418250641, + "loss": 2.8019, + "mean_token_accuracy": 0.45749831199645996, + "num_tokens": 7472703386.0, + "step": 14618 + }, + { + "epoch": 3.9532179556517035, + "grad_norm": 3.390625, + "learning_rate": 0.003991900183485082, + "loss": 2.9399, + "mean_token_accuracy": 0.43360069394111633, + "num_tokens": 7473227378.0, + "step": 14619 + }, + { + "epoch": 3.953488372093023, + "grad_norm": 3.609375, + "learning_rate": 0.0039909111637486325, + "loss": 2.9072, + "mean_token_accuracy": 0.4093945622444153, + "num_tokens": 7473751433.0, + "step": 14620 + }, + { + "epoch": 3.953758788534343, + "grad_norm": 2.640625, + "learning_rate": 0.003989922359071642, + "loss": 2.758, + "mean_token_accuracy": 0.43358272314071655, + "num_tokens": 7474275583.0, + "step": 14621 + }, + { + "epoch": 3.9540292049756625, + "grad_norm": 3.28125, + "learning_rate": 0.003988933769484451, + "loss": 2.9221, + "mean_token_accuracy": 0.42279356718063354, + "num_tokens": 7474799807.0, + "step": 14622 + }, + { + "epoch": 3.954299621416982, + "grad_norm": 2.84375, + "learning_rate": 0.003987945395017387, + "loss": 2.8175, + "mean_token_accuracy": 0.4351887106895447, + "num_tokens": 7475323788.0, + "step": 14623 + }, + { + "epoch": 3.9545700378583017, + "grad_norm": 3.5, + "learning_rate": 0.003986957235700782, + "loss": 2.7398, + "mean_token_accuracy": 0.4422711133956909, + "num_tokens": 7475842378.0, + "step": 14624 + }, + { + "epoch": 3.9548404542996214, + "grad_norm": 2.828125, + "learning_rate": 0.003985969291564952, + "loss": 2.7601, + "mean_token_accuracy": 0.4510722756385803, + "num_tokens": 7476366587.0, + "step": 14625 + }, + { + "epoch": 3.955110870740941, + "grad_norm": 3.609375, + "learning_rate": 0.003984981562640211, + "loss": 3.005, + "mean_token_accuracy": 0.4278187155723572, + "num_tokens": 7476877061.0, + "step": 14626 + }, + { + "epoch": 3.9553812871822607, + "grad_norm": 3.28125, + "learning_rate": 0.0039839940489568675, + "loss": 2.6869, + "mean_token_accuracy": 0.4287419319152832, + "num_tokens": 7477401246.0, + "step": 14627 + }, + { + "epoch": 3.9556517036235803, + "grad_norm": 2.734375, + "learning_rate": 0.003983006750545215, + "loss": 2.8381, + "mean_token_accuracy": 0.4452820122241974, + "num_tokens": 7477925453.0, + "step": 14628 + }, + { + "epoch": 3.9559221200649, + "grad_norm": 3.296875, + "learning_rate": 0.0039820196674355554, + "loss": 2.9217, + "mean_token_accuracy": 0.43321824073791504, + "num_tokens": 7478425903.0, + "step": 14629 + }, + { + "epoch": 3.9561925365062196, + "grad_norm": 3.671875, + "learning_rate": 0.003981032799658169, + "loss": 2.9738, + "mean_token_accuracy": 0.43005502223968506, + "num_tokens": 7478950077.0, + "step": 14630 + }, + { + "epoch": 3.9564629529475392, + "grad_norm": 16.5, + "learning_rate": 0.003980046147243338, + "loss": 9.1502, + "mean_token_accuracy": 0.004955017939209938, + "num_tokens": 7479474331.0, + "step": 14631 + }, + { + "epoch": 3.956733369388859, + "grad_norm": 6.90625, + "learning_rate": 0.003979059710221336, + "loss": 2.9342, + "mean_token_accuracy": 0.4431898295879364, + "num_tokens": 7479877422.0, + "step": 14632 + }, + { + "epoch": 3.9570037858301785, + "grad_norm": 3.046875, + "learning_rate": 0.00397807348862243, + "loss": 2.7673, + "mean_token_accuracy": 0.4403322637081146, + "num_tokens": 7480401640.0, + "step": 14633 + }, + { + "epoch": 3.957274202271498, + "grad_norm": 2.640625, + "learning_rate": 0.003977087482476877, + "loss": 2.7142, + "mean_token_accuracy": 0.44315841794013977, + "num_tokens": 7480874551.0, + "step": 14634 + }, + { + "epoch": 3.957544618712818, + "grad_norm": 2.6875, + "learning_rate": 0.0039761016918149344, + "loss": 2.5813, + "mean_token_accuracy": 0.46077656745910645, + "num_tokens": 7481398719.0, + "step": 14635 + }, + { + "epoch": 3.9578150351541375, + "grad_norm": 3.046875, + "learning_rate": 0.003975116116666847, + "loss": 2.7344, + "mean_token_accuracy": 0.4516420066356659, + "num_tokens": 7481922890.0, + "step": 14636 + }, + { + "epoch": 3.958085451595457, + "grad_norm": 2.875, + "learning_rate": 0.0039741307570628544, + "loss": 2.5151, + "mean_token_accuracy": 0.4530535340309143, + "num_tokens": 7482447124.0, + "step": 14637 + }, + { + "epoch": 3.9583558680367767, + "grad_norm": 2.90625, + "learning_rate": 0.003973145613033196, + "loss": 2.8821, + "mean_token_accuracy": 0.4281293451786041, + "num_tokens": 7482969062.0, + "step": 14638 + }, + { + "epoch": 3.9586262844780964, + "grad_norm": 3.546875, + "learning_rate": 0.003972160684608094, + "loss": 2.9233, + "mean_token_accuracy": 0.3852924108505249, + "num_tokens": 7483493278.0, + "step": 14639 + }, + { + "epoch": 3.958896700919416, + "grad_norm": 2.796875, + "learning_rate": 0.003971175971817767, + "loss": 2.7096, + "mean_token_accuracy": 0.47029799222946167, + "num_tokens": 7483982554.0, + "step": 14640 + }, + { + "epoch": 3.9591671173607357, + "grad_norm": 3.9375, + "learning_rate": 0.003970191474692435, + "loss": 2.9842, + "mean_token_accuracy": 0.43002310395240784, + "num_tokens": 7484506769.0, + "step": 14641 + }, + { + "epoch": 3.9594375338020553, + "grad_norm": 3.09375, + "learning_rate": 0.003969207193262301, + "loss": 2.6925, + "mean_token_accuracy": 0.431424617767334, + "num_tokens": 7485030864.0, + "step": 14642 + }, + { + "epoch": 3.959707950243375, + "grad_norm": 3.140625, + "learning_rate": 0.003968223127557565, + "loss": 2.8122, + "mean_token_accuracy": 0.4282627999782562, + "num_tokens": 7485555090.0, + "step": 14643 + }, + { + "epoch": 3.9599783666846946, + "grad_norm": 3.09375, + "learning_rate": 0.003967239277608424, + "loss": 2.9153, + "mean_token_accuracy": 0.45585212111473083, + "num_tokens": 7486019457.0, + "step": 14644 + }, + { + "epoch": 3.960248783126014, + "grad_norm": 2.71875, + "learning_rate": 0.003966255643445062, + "loss": 2.8255, + "mean_token_accuracy": 0.42365121841430664, + "num_tokens": 7486543625.0, + "step": 14645 + }, + { + "epoch": 3.960519199567334, + "grad_norm": 2.734375, + "learning_rate": 0.003965272225097665, + "loss": 2.9055, + "mean_token_accuracy": 0.41755804419517517, + "num_tokens": 7487067901.0, + "step": 14646 + }, + { + "epoch": 3.960789616008653, + "grad_norm": 3.796875, + "learning_rate": 0.003964289022596402, + "loss": 3.0115, + "mean_token_accuracy": 0.42586445808410645, + "num_tokens": 7487592055.0, + "step": 14647 + }, + { + "epoch": 3.961060032449973, + "grad_norm": 3.53125, + "learning_rate": 0.003963306035971441, + "loss": 2.9894, + "mean_token_accuracy": 0.4207989573478699, + "num_tokens": 7488116308.0, + "step": 14648 + }, + { + "epoch": 3.9613304488912924, + "grad_norm": 3.546875, + "learning_rate": 0.0039623232652529474, + "loss": 2.831, + "mean_token_accuracy": 0.4069584012031555, + "num_tokens": 7488640423.0, + "step": 14649 + }, + { + "epoch": 3.9616008653326125, + "grad_norm": 7.84375, + "learning_rate": 0.00396134071047107, + "loss": 2.8792, + "mean_token_accuracy": 0.4527530074119568, + "num_tokens": 7489147690.0, + "step": 14650 + }, + { + "epoch": 3.9618712817739317, + "grad_norm": 31.875, + "learning_rate": 0.0039603583716559575, + "loss": 10.247, + "mean_token_accuracy": 0.03646766021847725, + "num_tokens": 7489671935.0, + "step": 14651 + }, + { + "epoch": 3.9621416982152518, + "grad_norm": 3.40625, + "learning_rate": 0.003959376248837756, + "loss": 2.641, + "mean_token_accuracy": 0.46436983346939087, + "num_tokens": 7490196216.0, + "step": 14652 + }, + { + "epoch": 3.962412114656571, + "grad_norm": 3.0, + "learning_rate": 0.003958394342046596, + "loss": 2.8141, + "mean_token_accuracy": 0.44505950808525085, + "num_tokens": 7490680801.0, + "step": 14653 + }, + { + "epoch": 3.9626825310978906, + "grad_norm": 3.375, + "learning_rate": 0.003957412651312603, + "loss": 2.933, + "mean_token_accuracy": 0.4191156029701233, + "num_tokens": 7491205088.0, + "step": 14654 + }, + { + "epoch": 3.9629529475392102, + "grad_norm": 4.28125, + "learning_rate": 0.003956431176665905, + "loss": 2.8521, + "mean_token_accuracy": 0.42208775877952576, + "num_tokens": 7491675176.0, + "step": 14655 + }, + { + "epoch": 3.96322336398053, + "grad_norm": 3.734375, + "learning_rate": 0.00395544991813661, + "loss": 2.8162, + "mean_token_accuracy": 0.4298396706581116, + "num_tokens": 7492199438.0, + "step": 14656 + }, + { + "epoch": 3.9634937804218495, + "grad_norm": 3.875, + "learning_rate": 0.003954468875754826, + "loss": 2.6801, + "mean_token_accuracy": 0.42430031299591064, + "num_tokens": 7492675697.0, + "step": 14657 + }, + { + "epoch": 3.963764196863169, + "grad_norm": 2.921875, + "learning_rate": 0.003953488049550659, + "loss": 2.9831, + "mean_token_accuracy": 0.43155497312545776, + "num_tokens": 7493165122.0, + "step": 14658 + }, + { + "epoch": 3.964034613304489, + "grad_norm": 3.609375, + "learning_rate": 0.003952507439554203, + "loss": 2.8878, + "mean_token_accuracy": 0.4430689811706543, + "num_tokens": 7493689294.0, + "step": 14659 + }, + { + "epoch": 3.9643050297458085, + "grad_norm": 93.5, + "learning_rate": 0.003951527045795539, + "loss": 3.9981, + "mean_token_accuracy": 0.3902297616004944, + "num_tokens": 7494201698.0, + "step": 14660 + }, + { + "epoch": 3.964575446187128, + "grad_norm": 4.8125, + "learning_rate": 0.003950546868304757, + "loss": 2.8273, + "mean_token_accuracy": 0.4502595067024231, + "num_tokens": 7494666567.0, + "step": 14661 + }, + { + "epoch": 3.9648458626284477, + "grad_norm": 2.171875, + "learning_rate": 0.003949566907111929, + "loss": 2.9298, + "mean_token_accuracy": 0.43129003047943115, + "num_tokens": 7495190844.0, + "step": 14662 + }, + { + "epoch": 3.9651162790697674, + "grad_norm": 3.34375, + "learning_rate": 0.0039485871622471184, + "loss": 2.9579, + "mean_token_accuracy": 0.41358816623687744, + "num_tokens": 7495715089.0, + "step": 14663 + }, + { + "epoch": 3.965386695511087, + "grad_norm": 3.65625, + "learning_rate": 0.003947607633740391, + "loss": 2.9062, + "mean_token_accuracy": 0.43145865201950073, + "num_tokens": 7496239297.0, + "step": 14664 + }, + { + "epoch": 3.9656571119524067, + "grad_norm": 3.1875, + "learning_rate": 0.003946628321621804, + "loss": 2.8001, + "mean_token_accuracy": 0.43047285079956055, + "num_tokens": 7496763378.0, + "step": 14665 + }, + { + "epoch": 3.9659275283937263, + "grad_norm": 3.21875, + "learning_rate": 0.0039456492259213995, + "loss": 2.8195, + "mean_token_accuracy": 0.46125364303588867, + "num_tokens": 7497287588.0, + "step": 14666 + }, + { + "epoch": 3.966197944835046, + "grad_norm": 3.296875, + "learning_rate": 0.003944670346669225, + "loss": 3.0264, + "mean_token_accuracy": 0.4111135005950928, + "num_tokens": 7497811860.0, + "step": 14667 + }, + { + "epoch": 3.9664683612763656, + "grad_norm": 3.59375, + "learning_rate": 0.00394369168389531, + "loss": 2.7259, + "mean_token_accuracy": 0.4287281036376953, + "num_tokens": 7498336072.0, + "step": 14668 + }, + { + "epoch": 3.9667387777176852, + "grad_norm": 3.09375, + "learning_rate": 0.003942713237629689, + "loss": 2.6844, + "mean_token_accuracy": 0.4357457160949707, + "num_tokens": 7498844475.0, + "step": 14669 + }, + { + "epoch": 3.967009194159005, + "grad_norm": 2.984375, + "learning_rate": 0.003941735007902379, + "loss": 2.7638, + "mean_token_accuracy": 0.4473225176334381, + "num_tokens": 7499307315.0, + "step": 14670 + }, + { + "epoch": 3.9672796106003245, + "grad_norm": 35.75, + "learning_rate": 0.003940756994743393, + "loss": 9.8426, + "mean_token_accuracy": 0.0010246498277410865, + "num_tokens": 7499831398.0, + "step": 14671 + }, + { + "epoch": 3.967550027041644, + "grad_norm": 6.34375, + "learning_rate": 0.003939779198182745, + "loss": 3.0994, + "mean_token_accuracy": 0.4211757779121399, + "num_tokens": 7500355441.0, + "step": 14672 + }, + { + "epoch": 3.967820443482964, + "grad_norm": 2.421875, + "learning_rate": 0.003938801618250436, + "loss": 2.6294, + "mean_token_accuracy": 0.4397463798522949, + "num_tokens": 7500879472.0, + "step": 14673 + }, + { + "epoch": 3.9680908599242835, + "grad_norm": 2.59375, + "learning_rate": 0.003937824254976454, + "loss": 2.6285, + "mean_token_accuracy": 0.46075254678726196, + "num_tokens": 7501403749.0, + "step": 14674 + }, + { + "epoch": 3.968361276365603, + "grad_norm": 2.734375, + "learning_rate": 0.003936847108390797, + "loss": 2.8552, + "mean_token_accuracy": 0.44649726152420044, + "num_tokens": 7501927930.0, + "step": 14675 + }, + { + "epoch": 3.9686316928069227, + "grad_norm": 3.4375, + "learning_rate": 0.003935870178523439, + "loss": 2.6382, + "mean_token_accuracy": 0.4390851557254791, + "num_tokens": 7502452085.0, + "step": 14676 + }, + { + "epoch": 3.9689021092482424, + "grad_norm": 3.234375, + "learning_rate": 0.003934893465404359, + "loss": 2.8567, + "mean_token_accuracy": 0.4695836007595062, + "num_tokens": 7502911809.0, + "step": 14677 + }, + { + "epoch": 3.969172525689562, + "grad_norm": 3.8125, + "learning_rate": 0.003933916969063525, + "loss": 2.8914, + "mean_token_accuracy": 0.4345530867576599, + "num_tokens": 7503436063.0, + "step": 14678 + }, + { + "epoch": 3.9694429421308817, + "grad_norm": 3.703125, + "learning_rate": 0.003932940689530897, + "loss": 2.8421, + "mean_token_accuracy": 0.4379609227180481, + "num_tokens": 7503933955.0, + "step": 14679 + }, + { + "epoch": 3.9697133585722013, + "grad_norm": 2.953125, + "learning_rate": 0.0039319646268364305, + "loss": 2.7151, + "mean_token_accuracy": 0.4241151809692383, + "num_tokens": 7504458176.0, + "step": 14680 + }, + { + "epoch": 3.969983775013521, + "grad_norm": 2.984375, + "learning_rate": 0.003930988781010078, + "loss": 2.9418, + "mean_token_accuracy": 0.42452532052993774, + "num_tokens": 7504982442.0, + "step": 14681 + }, + { + "epoch": 3.9702541914548406, + "grad_norm": 3.453125, + "learning_rate": 0.003930013152081778, + "loss": 3.0374, + "mean_token_accuracy": 0.4187886118888855, + "num_tokens": 7505452495.0, + "step": 14682 + }, + { + "epoch": 3.9705246078961602, + "grad_norm": 3.484375, + "learning_rate": 0.003929037740081464, + "loss": 2.8242, + "mean_token_accuracy": 0.4372629225254059, + "num_tokens": 7505938787.0, + "step": 14683 + }, + { + "epoch": 3.97079502433748, + "grad_norm": 3.125, + "learning_rate": 0.003928062545039069, + "loss": 3.0582, + "mean_token_accuracy": 0.40888458490371704, + "num_tokens": 7506463066.0, + "step": 14684 + }, + { + "epoch": 3.9710654407787995, + "grad_norm": 2.890625, + "learning_rate": 0.0039270875669845115, + "loss": 2.7549, + "mean_token_accuracy": 0.42490386962890625, + "num_tokens": 7506987194.0, + "step": 14685 + }, + { + "epoch": 3.9713358572201187, + "grad_norm": 2.65625, + "learning_rate": 0.003926112805947706, + "loss": 2.8656, + "mean_token_accuracy": 0.43897998332977295, + "num_tokens": 7507483858.0, + "step": 14686 + }, + { + "epoch": 3.971606273661439, + "grad_norm": 3.375, + "learning_rate": 0.003925138261958565, + "loss": 2.7534, + "mean_token_accuracy": 0.4420504868030548, + "num_tokens": 7508008053.0, + "step": 14687 + }, + { + "epoch": 3.971876690102758, + "grad_norm": 3.046875, + "learning_rate": 0.003924163935046987, + "loss": 2.8798, + "mean_token_accuracy": 0.42208409309387207, + "num_tokens": 7508532330.0, + "step": 14688 + }, + { + "epoch": 3.972147106544078, + "grad_norm": 3.484375, + "learning_rate": 0.003923189825242867, + "loss": 2.8959, + "mean_token_accuracy": 0.43724316358566284, + "num_tokens": 7509034743.0, + "step": 14689 + }, + { + "epoch": 3.9724175229853973, + "grad_norm": 2.984375, + "learning_rate": 0.003922215932576095, + "loss": 2.7463, + "mean_token_accuracy": 0.46352070569992065, + "num_tokens": 7509445138.0, + "step": 14690 + }, + { + "epoch": 3.9726879394267174, + "grad_norm": 252.0, + "learning_rate": 0.003921242257076553, + "loss": 11.2415, + "mean_token_accuracy": 0.024735266342759132, + "num_tokens": 7509948773.0, + "step": 14691 + }, + { + "epoch": 3.9729583558680366, + "grad_norm": 7.78125, + "learning_rate": 0.003920268798774117, + "loss": 3.0324, + "mean_token_accuracy": 0.4074332118034363, + "num_tokens": 7510473005.0, + "step": 14692 + }, + { + "epoch": 3.9732287723093567, + "grad_norm": 4.8125, + "learning_rate": 0.003919295557698651, + "loss": 2.4727, + "mean_token_accuracy": 0.4844210147857666, + "num_tokens": 7510985240.0, + "step": 14693 + }, + { + "epoch": 3.973499188750676, + "grad_norm": 3.03125, + "learning_rate": 0.003918322533880021, + "loss": 2.7979, + "mean_token_accuracy": 0.43230342864990234, + "num_tokens": 7511472722.0, + "step": 14694 + }, + { + "epoch": 3.9737696051919955, + "grad_norm": 2.859375, + "learning_rate": 0.003917349727348084, + "loss": 2.8214, + "mean_token_accuracy": 0.4352529048919678, + "num_tokens": 7511996953.0, + "step": 14695 + }, + { + "epoch": 3.974040021633315, + "grad_norm": 2.78125, + "learning_rate": 0.003916377138132685, + "loss": 2.7431, + "mean_token_accuracy": 0.4529457092285156, + "num_tokens": 7512521050.0, + "step": 14696 + }, + { + "epoch": 3.974310438074635, + "grad_norm": 2.734375, + "learning_rate": 0.003915404766263667, + "loss": 2.8849, + "mean_token_accuracy": 0.42652058601379395, + "num_tokens": 7513045302.0, + "step": 14697 + }, + { + "epoch": 3.9745808545159544, + "grad_norm": 3.1875, + "learning_rate": 0.0039144326117708655, + "loss": 2.8594, + "mean_token_accuracy": 0.435991495847702, + "num_tokens": 7513511226.0, + "step": 14698 + }, + { + "epoch": 3.974851270957274, + "grad_norm": 3.578125, + "learning_rate": 0.00391346067468411, + "loss": 2.4613, + "mean_token_accuracy": 0.5045474767684937, + "num_tokens": 7514035375.0, + "step": 14699 + }, + { + "epoch": 3.9751216873985937, + "grad_norm": 3.421875, + "learning_rate": 0.003912488955033218, + "loss": 2.7426, + "mean_token_accuracy": 0.4364118278026581, + "num_tokens": 7514559653.0, + "step": 14700 + }, + { + "epoch": 3.9753921038399134, + "grad_norm": 3.359375, + "learning_rate": 0.003911517452848011, + "loss": 2.8608, + "mean_token_accuracy": 0.41763195395469666, + "num_tokens": 7515083927.0, + "step": 14701 + }, + { + "epoch": 3.975662520281233, + "grad_norm": 3.875, + "learning_rate": 0.003910546168158294, + "loss": 2.9129, + "mean_token_accuracy": 0.4459787607192993, + "num_tokens": 7515608212.0, + "step": 14702 + }, + { + "epoch": 3.9759329367225527, + "grad_norm": 3.21875, + "learning_rate": 0.003909575100993866, + "loss": 2.9285, + "mean_token_accuracy": 0.42788341641426086, + "num_tokens": 7516132383.0, + "step": 14703 + }, + { + "epoch": 3.9762033531638723, + "grad_norm": 2.953125, + "learning_rate": 0.003908604251384529, + "loss": 2.9035, + "mean_token_accuracy": 0.4384874701499939, + "num_tokens": 7516626838.0, + "step": 14704 + }, + { + "epoch": 3.976473769605192, + "grad_norm": 3.09375, + "learning_rate": 0.003907633619360068, + "loss": 2.9406, + "mean_token_accuracy": 0.4305408000946045, + "num_tokens": 7517151108.0, + "step": 14705 + }, + { + "epoch": 3.9767441860465116, + "grad_norm": 3.4375, + "learning_rate": 0.003906663204950263, + "loss": 2.8884, + "mean_token_accuracy": 0.43012669682502747, + "num_tokens": 7517633371.0, + "step": 14706 + }, + { + "epoch": 3.9770146024878312, + "grad_norm": 3.765625, + "learning_rate": 0.0039056930081848894, + "loss": 2.94, + "mean_token_accuracy": 0.4299215078353882, + "num_tokens": 7518157538.0, + "step": 14707 + }, + { + "epoch": 3.977285018929151, + "grad_norm": 3.46875, + "learning_rate": 0.0039047230290937215, + "loss": 2.8965, + "mean_token_accuracy": 0.44199857115745544, + "num_tokens": 7518662445.0, + "step": 14708 + }, + { + "epoch": 3.9775554353704705, + "grad_norm": 3.828125, + "learning_rate": 0.0039037532677065135, + "loss": 2.9204, + "mean_token_accuracy": 0.4216901361942291, + "num_tokens": 7519186641.0, + "step": 14709 + }, + { + "epoch": 3.97782585181179, + "grad_norm": 2.8125, + "learning_rate": 0.003902783724053026, + "loss": 2.9272, + "mean_token_accuracy": 0.4472169578075409, + "num_tokens": 7519710812.0, + "step": 14710 + }, + { + "epoch": 3.97809626825311, + "grad_norm": 120.0, + "learning_rate": 0.003901814398163006, + "loss": 13.0523, + "mean_token_accuracy": 0.003581176046282053, + "num_tokens": 7520234984.0, + "step": 14711 + }, + { + "epoch": 3.9783666846944294, + "grad_norm": 6.28125, + "learning_rate": 0.0039008452900661916, + "loss": 2.8373, + "mean_token_accuracy": 0.4260730743408203, + "num_tokens": 7520759205.0, + "step": 14712 + }, + { + "epoch": 3.978637101135749, + "grad_norm": 2.90625, + "learning_rate": 0.003899876399792324, + "loss": 3.0737, + "mean_token_accuracy": 0.4229615330696106, + "num_tokens": 7521261681.0, + "step": 14713 + }, + { + "epoch": 3.9789075175770687, + "grad_norm": 3.703125, + "learning_rate": 0.003898907727371126, + "loss": 2.909, + "mean_token_accuracy": 0.4409411549568176, + "num_tokens": 7521741269.0, + "step": 14714 + }, + { + "epoch": 3.9791779340183884, + "grad_norm": 2.921875, + "learning_rate": 0.0038979392728323236, + "loss": 2.9412, + "mean_token_accuracy": 0.4293339252471924, + "num_tokens": 7522231566.0, + "step": 14715 + }, + { + "epoch": 3.979448350459708, + "grad_norm": 4.21875, + "learning_rate": 0.003896971036205632, + "loss": 2.9545, + "mean_token_accuracy": 0.43166226148605347, + "num_tokens": 7522755846.0, + "step": 14716 + }, + { + "epoch": 3.9797187669010277, + "grad_norm": 3.015625, + "learning_rate": 0.003896003017520754, + "loss": 2.8866, + "mean_token_accuracy": 0.4341268241405487, + "num_tokens": 7523280100.0, + "step": 14717 + }, + { + "epoch": 3.9799891833423473, + "grad_norm": 3.96875, + "learning_rate": 0.0038950352168073965, + "loss": 2.9534, + "mean_token_accuracy": 0.4230264127254486, + "num_tokens": 7523804168.0, + "step": 14718 + }, + { + "epoch": 3.980259599783667, + "grad_norm": 2.65625, + "learning_rate": 0.003894067634095254, + "loss": 2.7633, + "mean_token_accuracy": 0.4574214220046997, + "num_tokens": 7524328231.0, + "step": 14719 + }, + { + "epoch": 3.9805300162249866, + "grad_norm": 3.171875, + "learning_rate": 0.0038931002694140115, + "loss": 2.7885, + "mean_token_accuracy": 0.4311801493167877, + "num_tokens": 7524852473.0, + "step": 14720 + }, + { + "epoch": 3.9808004326663062, + "grad_norm": 3.234375, + "learning_rate": 0.003892133122793352, + "loss": 2.8456, + "mean_token_accuracy": 0.4170709550380707, + "num_tokens": 7525376646.0, + "step": 14721 + }, + { + "epoch": 3.981070849107626, + "grad_norm": 3.0625, + "learning_rate": 0.003891166194262954, + "loss": 2.8937, + "mean_token_accuracy": 0.46090441942214966, + "num_tokens": 7525839260.0, + "step": 14722 + }, + { + "epoch": 3.9813412655489455, + "grad_norm": 3.40625, + "learning_rate": 0.0038901994838524802, + "loss": 2.7822, + "mean_token_accuracy": 0.45367124676704407, + "num_tokens": 7526363398.0, + "step": 14723 + }, + { + "epoch": 3.981611681990265, + "grad_norm": 3.828125, + "learning_rate": 0.003889232991591597, + "loss": 2.735, + "mean_token_accuracy": 0.44421958923339844, + "num_tokens": 7526887452.0, + "step": 14724 + }, + { + "epoch": 3.981882098431585, + "grad_norm": 3.65625, + "learning_rate": 0.0038882667175099577, + "loss": 2.7944, + "mean_token_accuracy": 0.43128108978271484, + "num_tokens": 7527411643.0, + "step": 14725 + }, + { + "epoch": 3.9821525148729044, + "grad_norm": 3.328125, + "learning_rate": 0.0038873006616372075, + "loss": 2.8684, + "mean_token_accuracy": 0.43401867151260376, + "num_tokens": 7527935892.0, + "step": 14726 + }, + { + "epoch": 3.9824229313142236, + "grad_norm": 3.765625, + "learning_rate": 0.0038863348240029904, + "loss": 2.8142, + "mean_token_accuracy": 0.4642944931983948, + "num_tokens": 7528398248.0, + "step": 14727 + }, + { + "epoch": 3.9826933477555437, + "grad_norm": 2.84375, + "learning_rate": 0.0038853692046369433, + "loss": 2.7417, + "mean_token_accuracy": 0.41697341203689575, + "num_tokens": 7528922496.0, + "step": 14728 + }, + { + "epoch": 3.982963764196863, + "grad_norm": 3.125, + "learning_rate": 0.003884403803568688, + "loss": 2.668, + "mean_token_accuracy": 0.43586283922195435, + "num_tokens": 7529446641.0, + "step": 14729 + }, + { + "epoch": 3.983234180638183, + "grad_norm": 2.703125, + "learning_rate": 0.0038834386208278526, + "loss": 2.7832, + "mean_token_accuracy": 0.44286656379699707, + "num_tokens": 7529967118.0, + "step": 14730 + }, + { + "epoch": 3.983504597079502, + "grad_norm": 14.1875, + "learning_rate": 0.003882473656444049, + "loss": 9.0445, + "mean_token_accuracy": 0.017455555498600006, + "num_tokens": 7530491403.0, + "step": 14731 + }, + { + "epoch": 3.9837750135208223, + "grad_norm": 6.25, + "learning_rate": 0.0038815089104468825, + "loss": 2.8561, + "mean_token_accuracy": 0.4223533868789673, + "num_tokens": 7531012766.0, + "step": 14732 + }, + { + "epoch": 3.9840454299621415, + "grad_norm": 2.46875, + "learning_rate": 0.003880544382865958, + "loss": 2.6988, + "mean_token_accuracy": 0.43162721395492554, + "num_tokens": 7531536998.0, + "step": 14733 + }, + { + "epoch": 3.9843158464034616, + "grad_norm": 2.984375, + "learning_rate": 0.0038795800737308674, + "loss": 2.8396, + "mean_token_accuracy": 0.44216907024383545, + "num_tokens": 7532061184.0, + "step": 14734 + }, + { + "epoch": 3.984586262844781, + "grad_norm": 3.46875, + "learning_rate": 0.003878615983071203, + "loss": 2.8027, + "mean_token_accuracy": 0.4361830949783325, + "num_tokens": 7532585338.0, + "step": 14735 + }, + { + "epoch": 3.9848566792861004, + "grad_norm": 2.5625, + "learning_rate": 0.0038776521109165386, + "loss": 2.806, + "mean_token_accuracy": 0.416343629360199, + "num_tokens": 7533109622.0, + "step": 14736 + }, + { + "epoch": 3.98512709572742, + "grad_norm": 3.484375, + "learning_rate": 0.0038766884572964534, + "loss": 2.7971, + "mean_token_accuracy": 0.44309014081954956, + "num_tokens": 7533593354.0, + "step": 14737 + }, + { + "epoch": 3.9853975121687397, + "grad_norm": 3.078125, + "learning_rate": 0.0038757250222405176, + "loss": 2.6063, + "mean_token_accuracy": 0.4410133361816406, + "num_tokens": 7534117488.0, + "step": 14738 + }, + { + "epoch": 3.9856679286100594, + "grad_norm": 3.125, + "learning_rate": 0.0038747618057782896, + "loss": 2.6825, + "mean_token_accuracy": 0.44364333152770996, + "num_tokens": 7534640594.0, + "step": 14739 + }, + { + "epoch": 3.985938345051379, + "grad_norm": 3.03125, + "learning_rate": 0.0038737988079393204, + "loss": 2.8188, + "mean_token_accuracy": 0.43274742364883423, + "num_tokens": 7535164781.0, + "step": 14740 + }, + { + "epoch": 3.9862087614926986, + "grad_norm": 3.703125, + "learning_rate": 0.0038728360287531628, + "loss": 2.9022, + "mean_token_accuracy": 0.43663927912712097, + "num_tokens": 7535626936.0, + "step": 14741 + }, + { + "epoch": 3.9864791779340183, + "grad_norm": 3.03125, + "learning_rate": 0.003871873468249356, + "loss": 2.9014, + "mean_token_accuracy": 0.422006756067276, + "num_tokens": 7536151163.0, + "step": 14742 + }, + { + "epoch": 3.986749594375338, + "grad_norm": 18.375, + "learning_rate": 0.0038709111264574316, + "loss": 2.419, + "mean_token_accuracy": 0.4926782250404358, + "num_tokens": 7536657529.0, + "step": 14743 + }, + { + "epoch": 3.9870200108166576, + "grad_norm": 5.21875, + "learning_rate": 0.0038699490034069207, + "loss": 3.0823, + "mean_token_accuracy": 0.41330331563949585, + "num_tokens": 7537151571.0, + "step": 14744 + }, + { + "epoch": 3.987290427257977, + "grad_norm": 2.828125, + "learning_rate": 0.0038689870991273426, + "loss": 2.8501, + "mean_token_accuracy": 0.41909027099609375, + "num_tokens": 7537675808.0, + "step": 14745 + }, + { + "epoch": 3.987560843699297, + "grad_norm": 2.65625, + "learning_rate": 0.0038680254136482083, + "loss": 2.8205, + "mean_token_accuracy": 0.4351770877838135, + "num_tokens": 7538200033.0, + "step": 14746 + }, + { + "epoch": 3.9878312601406165, + "grad_norm": 3.484375, + "learning_rate": 0.003867063946999031, + "loss": 2.7961, + "mean_token_accuracy": 0.43191760778427124, + "num_tokens": 7538724134.0, + "step": 14747 + }, + { + "epoch": 3.988101676581936, + "grad_norm": 3.5625, + "learning_rate": 0.0038661026992093095, + "loss": 2.7469, + "mean_token_accuracy": 0.4423729181289673, + "num_tokens": 7539248302.0, + "step": 14748 + }, + { + "epoch": 3.988372093023256, + "grad_norm": 3.546875, + "learning_rate": 0.0038651416703085325, + "loss": 2.9819, + "mean_token_accuracy": 0.4625246822834015, + "num_tokens": 7539709367.0, + "step": 14749 + }, + { + "epoch": 3.9886425094645754, + "grad_norm": 3.546875, + "learning_rate": 0.003864180860326192, + "loss": 2.67, + "mean_token_accuracy": 0.4292086362838745, + "num_tokens": 7540223201.0, + "step": 14750 + }, + { + "epoch": 3.988912925905895, + "grad_norm": 24.375, + "learning_rate": 0.0038632202692917704, + "loss": 9.6459, + "mean_token_accuracy": 0.01704147458076477, + "num_tokens": 7540692448.0, + "step": 14751 + }, + { + "epoch": 3.9891833423472147, + "grad_norm": 6.375, + "learning_rate": 0.0038622598972347356, + "loss": 3.0626, + "mean_token_accuracy": 0.4193859398365021, + "num_tokens": 7541172662.0, + "step": 14752 + }, + { + "epoch": 3.9894537587885344, + "grad_norm": 2.265625, + "learning_rate": 0.003861299744184562, + "loss": 2.7822, + "mean_token_accuracy": 0.434085875749588, + "num_tokens": 7541696861.0, + "step": 14753 + }, + { + "epoch": 3.989724175229854, + "grad_norm": 2.625, + "learning_rate": 0.0038603398101707044, + "loss": 2.5924, + "mean_token_accuracy": 0.4592446982860565, + "num_tokens": 7542211197.0, + "step": 14754 + }, + { + "epoch": 3.9899945916711737, + "grad_norm": 3.34375, + "learning_rate": 0.003859380095222616, + "loss": 2.9599, + "mean_token_accuracy": 0.42067503929138184, + "num_tokens": 7542735335.0, + "step": 14755 + }, + { + "epoch": 3.9902650081124933, + "grad_norm": 2.984375, + "learning_rate": 0.0038584205993697465, + "loss": 2.8591, + "mean_token_accuracy": 0.4626823663711548, + "num_tokens": 7543195898.0, + "step": 14756 + }, + { + "epoch": 3.990535424553813, + "grad_norm": 3.734375, + "learning_rate": 0.0038574613226415367, + "loss": 2.8962, + "mean_token_accuracy": 0.4258740246295929, + "num_tokens": 7543720059.0, + "step": 14757 + }, + { + "epoch": 3.9908058409951326, + "grad_norm": 3.0, + "learning_rate": 0.003856502265067416, + "loss": 2.7064, + "mean_token_accuracy": 0.43637216091156006, + "num_tokens": 7544244171.0, + "step": 14758 + }, + { + "epoch": 3.9910762574364522, + "grad_norm": 3.359375, + "learning_rate": 0.003855543426676814, + "loss": 2.7004, + "mean_token_accuracy": 0.45182138681411743, + "num_tokens": 7544739999.0, + "step": 14759 + }, + { + "epoch": 3.991346673877772, + "grad_norm": 2.84375, + "learning_rate": 0.0038545848074991495, + "loss": 2.9429, + "mean_token_accuracy": 0.41602978110313416, + "num_tokens": 7545264155.0, + "step": 14760 + }, + { + "epoch": 3.9916170903190915, + "grad_norm": 3.09375, + "learning_rate": 0.0038536264075638388, + "loss": 2.7185, + "mean_token_accuracy": 0.45571380853652954, + "num_tokens": 7545728913.0, + "step": 14761 + }, + { + "epoch": 3.991887506760411, + "grad_norm": 3.28125, + "learning_rate": 0.0038526682269002855, + "loss": 2.7488, + "mean_token_accuracy": 0.4356808364391327, + "num_tokens": 7546253131.0, + "step": 14762 + }, + { + "epoch": 3.992157923201731, + "grad_norm": 4.03125, + "learning_rate": 0.0038517102655378876, + "loss": 2.7758, + "mean_token_accuracy": 0.44546592235565186, + "num_tokens": 7546777329.0, + "step": 14763 + }, + { + "epoch": 3.9924283396430504, + "grad_norm": 2.828125, + "learning_rate": 0.00385075252350604, + "loss": 2.778, + "mean_token_accuracy": 0.4479013979434967, + "num_tokens": 7547301494.0, + "step": 14764 + }, + { + "epoch": 3.99269875608437, + "grad_norm": 3.34375, + "learning_rate": 0.003849795000834132, + "loss": 2.6848, + "mean_token_accuracy": 0.4377024471759796, + "num_tokens": 7547806925.0, + "step": 14765 + }, + { + "epoch": 3.9929691725256897, + "grad_norm": 3.109375, + "learning_rate": 0.0038488376975515395, + "loss": 2.8004, + "mean_token_accuracy": 0.42048579454421997, + "num_tokens": 7548331198.0, + "step": 14766 + }, + { + "epoch": 3.9932395889670094, + "grad_norm": 3.65625, + "learning_rate": 0.0038478806136876386, + "loss": 2.8322, + "mean_token_accuracy": 0.41802966594696045, + "num_tokens": 7548855383.0, + "step": 14767 + }, + { + "epoch": 3.9935100054083286, + "grad_norm": 4.625, + "learning_rate": 0.0038469237492717925, + "loss": 2.9471, + "mean_token_accuracy": 0.41883203387260437, + "num_tokens": 7549349014.0, + "step": 14768 + }, + { + "epoch": 3.9937804218496487, + "grad_norm": 3.640625, + "learning_rate": 0.00384596710433336, + "loss": 2.6245, + "mean_token_accuracy": 0.4622320532798767, + "num_tokens": 7549810586.0, + "step": 14769 + }, + { + "epoch": 3.994050838290968, + "grad_norm": 3.765625, + "learning_rate": 0.003845010678901698, + "loss": 2.7656, + "mean_token_accuracy": 0.44143086671829224, + "num_tokens": 7550312518.0, + "step": 14770 + }, + { + "epoch": 3.994321254732288, + "grad_norm": 19.875, + "learning_rate": 0.003844054473006149, + "loss": 9.7422, + "mean_token_accuracy": 0.02660490944981575, + "num_tokens": 7550767834.0, + "step": 14771 + }, + { + "epoch": 3.994591671173607, + "grad_norm": 7.21875, + "learning_rate": 0.003843098486676051, + "loss": 3.0622, + "mean_token_accuracy": 0.3935167193412781, + "num_tokens": 7551291993.0, + "step": 14772 + }, + { + "epoch": 3.9948620876149272, + "grad_norm": 2.171875, + "learning_rate": 0.0038421427199407422, + "loss": 2.7467, + "mean_token_accuracy": 0.4494778513908386, + "num_tokens": 7551774172.0, + "step": 14773 + }, + { + "epoch": 3.9951325040562464, + "grad_norm": 3.546875, + "learning_rate": 0.003841187172829542, + "loss": 2.7637, + "mean_token_accuracy": 0.42937344312667847, + "num_tokens": 7552298450.0, + "step": 14774 + }, + { + "epoch": 3.9954029204975665, + "grad_norm": 2.828125, + "learning_rate": 0.003840231845371772, + "loss": 2.9106, + "mean_token_accuracy": 0.39104342460632324, + "num_tokens": 7552822550.0, + "step": 14775 + }, + { + "epoch": 3.9956733369388857, + "grad_norm": 3.09375, + "learning_rate": 0.003839276737596746, + "loss": 2.6502, + "mean_token_accuracy": 0.4494906961917877, + "num_tokens": 7553346638.0, + "step": 14776 + }, + { + "epoch": 3.9959437533802054, + "grad_norm": 3.484375, + "learning_rate": 0.0038383218495337654, + "loss": 2.8603, + "mean_token_accuracy": 0.4293835759162903, + "num_tokens": 7553870841.0, + "step": 14777 + }, + { + "epoch": 3.996214169821525, + "grad_norm": 3.890625, + "learning_rate": 0.003837367181212134, + "loss": 3.02, + "mean_token_accuracy": 0.43581610918045044, + "num_tokens": 7554340092.0, + "step": 14778 + }, + { + "epoch": 3.9964845862628446, + "grad_norm": 3.328125, + "learning_rate": 0.0038364127326611388, + "loss": 2.8515, + "mean_token_accuracy": 0.42944252490997314, + "num_tokens": 7554864333.0, + "step": 14779 + }, + { + "epoch": 3.9967550027041643, + "grad_norm": 3.375, + "learning_rate": 0.0038354585039100716, + "loss": 2.8838, + "mean_token_accuracy": 0.40356048941612244, + "num_tokens": 7555388531.0, + "step": 14780 + }, + { + "epoch": 3.997025419145484, + "grad_norm": 3.390625, + "learning_rate": 0.0038345044949882026, + "loss": 2.7845, + "mean_token_accuracy": 0.42626717686653137, + "num_tokens": 7555912725.0, + "step": 14781 + }, + { + "epoch": 3.9972958355868036, + "grad_norm": 2.890625, + "learning_rate": 0.0038335507059248115, + "loss": 2.8434, + "mean_token_accuracy": 0.42979753017425537, + "num_tokens": 7556436921.0, + "step": 14782 + }, + { + "epoch": 3.997566252028123, + "grad_norm": 3.703125, + "learning_rate": 0.0038325971367491577, + "loss": 3.0005, + "mean_token_accuracy": 0.4057287871837616, + "num_tokens": 7556961192.0, + "step": 14783 + }, + { + "epoch": 3.997836668469443, + "grad_norm": 3.375, + "learning_rate": 0.0038316437874905034, + "loss": 2.7511, + "mean_token_accuracy": 0.424444317817688, + "num_tokens": 7557485368.0, + "step": 14784 + }, + { + "epoch": 3.9981070849107625, + "grad_norm": 3.078125, + "learning_rate": 0.0038306906581780987, + "loss": 2.9547, + "mean_token_accuracy": 0.4443877339363098, + "num_tokens": 7557966523.0, + "step": 14785 + }, + { + "epoch": 3.998377501352082, + "grad_norm": 4.0, + "learning_rate": 0.0038297377488411867, + "loss": 2.9046, + "mean_token_accuracy": 0.43991321325302124, + "num_tokens": 7558490716.0, + "step": 14786 + }, + { + "epoch": 3.998647917793402, + "grad_norm": 3.3125, + "learning_rate": 0.003828785059509008, + "loss": 2.8616, + "mean_token_accuracy": 0.440001517534256, + "num_tokens": 7559014921.0, + "step": 14787 + }, + { + "epoch": 3.9989183342347214, + "grad_norm": 3.609375, + "learning_rate": 0.0038278325902107936, + "loss": 2.574, + "mean_token_accuracy": 0.44860315322875977, + "num_tokens": 7559539033.0, + "step": 14788 + }, + { + "epoch": 3.999188750676041, + "grad_norm": 3.125, + "learning_rate": 0.0038268803409757645, + "loss": 2.8181, + "mean_token_accuracy": 0.43971699476242065, + "num_tokens": 7560007864.0, + "step": 14789 + }, + { + "epoch": 3.9994591671173607, + "grad_norm": 3.8125, + "learning_rate": 0.003825928311833143, + "loss": 2.5538, + "mean_token_accuracy": 0.454217791557312, + "num_tokens": 7560532057.0, + "step": 14790 + }, + { + "epoch": 3.9997295835586804, + "grad_norm": 18.5, + "learning_rate": 0.0038249765028121386, + "loss": 10.5408, + "mean_token_accuracy": 0.0002503998694010079, + "num_tokens": 7561035901.0, + "step": 14791 + }, + { + "epoch": 4.0, + "grad_norm": 7.8125, + "learning_rate": 0.003824024913941956, + "loss": 2.8353, + "mean_token_accuracy": 0.43082886934280396, + "num_tokens": 7561297916.0, + "step": 14792 + }, + { + "epoch": 4.000270416441319, + "grad_norm": 2.328125, + "learning_rate": 0.0038230735452517896, + "loss": 2.8623, + "mean_token_accuracy": 0.42663466930389404, + "num_tokens": 7561822112.0, + "step": 14793 + }, + { + "epoch": 4.000540832882639, + "grad_norm": 2.625, + "learning_rate": 0.0038221223967708357, + "loss": 2.7306, + "mean_token_accuracy": 0.45033323764801025, + "num_tokens": 7562346308.0, + "step": 14794 + }, + { + "epoch": 4.0008112493239585, + "grad_norm": 2.546875, + "learning_rate": 0.003821171468528273, + "loss": 2.7441, + "mean_token_accuracy": 0.43404191732406616, + "num_tokens": 7562870558.0, + "step": 14795 + }, + { + "epoch": 4.001081665765279, + "grad_norm": 2.78125, + "learning_rate": 0.0038202207605532836, + "loss": 2.6925, + "mean_token_accuracy": 0.4394228756427765, + "num_tokens": 7563394837.0, + "step": 14796 + }, + { + "epoch": 4.001352082206598, + "grad_norm": 3.265625, + "learning_rate": 0.003819270272875036, + "loss": 2.7862, + "mean_token_accuracy": 0.44423654675483704, + "num_tokens": 7563858539.0, + "step": 14797 + }, + { + "epoch": 4.001622498647918, + "grad_norm": 3.15625, + "learning_rate": 0.0038183200055226904, + "loss": 2.7328, + "mean_token_accuracy": 0.4614216089248657, + "num_tokens": 7564382710.0, + "step": 14798 + }, + { + "epoch": 4.001892915089237, + "grad_norm": 2.75, + "learning_rate": 0.003817369958525409, + "loss": 2.9974, + "mean_token_accuracy": 0.3941917419433594, + "num_tokens": 7564906843.0, + "step": 14799 + }, + { + "epoch": 4.002163331530557, + "grad_norm": 3.265625, + "learning_rate": 0.0038164201319123413, + "loss": 2.8089, + "mean_token_accuracy": 0.42841780185699463, + "num_tokens": 7565394751.0, + "step": 14800 + }, + { + "epoch": 4.002433747971876, + "grad_norm": 2.921875, + "learning_rate": 0.0038154705257126267, + "loss": 2.7655, + "mean_token_accuracy": 0.4336967170238495, + "num_tokens": 7565919025.0, + "step": 14801 + }, + { + "epoch": 4.002704164413196, + "grad_norm": 2.875, + "learning_rate": 0.003814521139955407, + "loss": 2.9435, + "mean_token_accuracy": 0.4036332964897156, + "num_tokens": 7566443243.0, + "step": 14802 + }, + { + "epoch": 4.002974580854516, + "grad_norm": 3.234375, + "learning_rate": 0.0038135719746698092, + "loss": 2.9119, + "mean_token_accuracy": 0.42206305265426636, + "num_tokens": 7566967510.0, + "step": 14803 + }, + { + "epoch": 4.003244997295836, + "grad_norm": 3.140625, + "learning_rate": 0.003812623029884955, + "loss": 2.7062, + "mean_token_accuracy": 0.4338746964931488, + "num_tokens": 7567491636.0, + "step": 14804 + }, + { + "epoch": 4.003515413737155, + "grad_norm": 13.4375, + "learning_rate": 0.0038116743056299664, + "loss": 2.7699, + "mean_token_accuracy": 0.43608003854751587, + "num_tokens": 7568015825.0, + "step": 14805 + }, + { + "epoch": 4.003785830178475, + "grad_norm": 3.25, + "learning_rate": 0.0038107258019339464, + "loss": 2.8752, + "mean_token_accuracy": 0.43373286724090576, + "num_tokens": 7568540087.0, + "step": 14806 + }, + { + "epoch": 4.004056246619794, + "grad_norm": 3.015625, + "learning_rate": 0.0038097775188260007, + "loss": 2.9143, + "mean_token_accuracy": 0.4283335208892822, + "num_tokens": 7569019794.0, + "step": 14807 + }, + { + "epoch": 4.004326663061114, + "grad_norm": 3.90625, + "learning_rate": 0.003808829456335229, + "loss": 2.931, + "mean_token_accuracy": 0.43861308693885803, + "num_tokens": 7569543947.0, + "step": 14808 + }, + { + "epoch": 4.0045970795024335, + "grad_norm": 4.75, + "learning_rate": 0.0038078816144907148, + "loss": 3.0035, + "mean_token_accuracy": 0.416536420583725, + "num_tokens": 7570023389.0, + "step": 14809 + }, + { + "epoch": 4.004867495943754, + "grad_norm": 3.078125, + "learning_rate": 0.0038069339933215446, + "loss": 2.7155, + "mean_token_accuracy": 0.4561706483364105, + "num_tokens": 7570500841.0, + "step": 14810 + }, + { + "epoch": 4.005137912385073, + "grad_norm": 14.375, + "learning_rate": 0.003805986592856795, + "loss": 8.6842, + "mean_token_accuracy": 0.012479451484978199, + "num_tokens": 7571025041.0, + "step": 14811 + }, + { + "epoch": 4.005408328826393, + "grad_norm": 6.59375, + "learning_rate": 0.003805039413125529, + "loss": 2.7044, + "mean_token_accuracy": 0.45745259523391724, + "num_tokens": 7571549266.0, + "step": 14812 + }, + { + "epoch": 4.005678745267712, + "grad_norm": 2.1875, + "learning_rate": 0.0038040924541568157, + "loss": 2.9654, + "mean_token_accuracy": 0.42309805750846863, + "num_tokens": 7572073396.0, + "step": 14813 + }, + { + "epoch": 4.005949161709032, + "grad_norm": 3.0625, + "learning_rate": 0.003803145715979707, + "loss": 2.9033, + "mean_token_accuracy": 0.4318380355834961, + "num_tokens": 7572597571.0, + "step": 14814 + }, + { + "epoch": 4.006219578150351, + "grad_norm": 4.0, + "learning_rate": 0.0038021991986232507, + "loss": 2.7945, + "mean_token_accuracy": 0.43726518750190735, + "num_tokens": 7573121790.0, + "step": 14815 + }, + { + "epoch": 4.006489994591671, + "grad_norm": 3.453125, + "learning_rate": 0.003801252902116492, + "loss": 3.0184, + "mean_token_accuracy": 0.4352673888206482, + "num_tokens": 7573591285.0, + "step": 14816 + }, + { + "epoch": 4.006760411032991, + "grad_norm": 3.828125, + "learning_rate": 0.0038003068264884656, + "loss": 2.7691, + "mean_token_accuracy": 0.4421844482421875, + "num_tokens": 7574115314.0, + "step": 14817 + }, + { + "epoch": 4.007030827474311, + "grad_norm": 3.1875, + "learning_rate": 0.0037993609717681953, + "loss": 2.7893, + "mean_token_accuracy": 0.45243701338768005, + "num_tokens": 7574639526.0, + "step": 14818 + }, + { + "epoch": 4.00730124391563, + "grad_norm": 4.4375, + "learning_rate": 0.0037984153379847087, + "loss": 3.0957, + "mean_token_accuracy": 0.4250059723854065, + "num_tokens": 7575163720.0, + "step": 14819 + }, + { + "epoch": 4.00757166035695, + "grad_norm": 3.53125, + "learning_rate": 0.0037974699251670163, + "loss": 2.6875, + "mean_token_accuracy": 0.43325090408325195, + "num_tokens": 7575643468.0, + "step": 14820 + }, + { + "epoch": 4.007842076798269, + "grad_norm": 3.15625, + "learning_rate": 0.00379652473334413, + "loss": 2.7964, + "mean_token_accuracy": 0.43723854422569275, + "num_tokens": 7576167726.0, + "step": 14821 + }, + { + "epoch": 4.008112493239589, + "grad_norm": 3.640625, + "learning_rate": 0.003795579762545045, + "loss": 2.6977, + "mean_token_accuracy": 0.4127313494682312, + "num_tokens": 7576691976.0, + "step": 14822 + }, + { + "epoch": 4.0083829096809085, + "grad_norm": 3.328125, + "learning_rate": 0.003794635012798764, + "loss": 2.7025, + "mean_token_accuracy": 0.44917163252830505, + "num_tokens": 7577216135.0, + "step": 14823 + }, + { + "epoch": 4.008653326122229, + "grad_norm": 3.734375, + "learning_rate": 0.0037936904841342667, + "loss": 2.5705, + "mean_token_accuracy": 0.4438689947128296, + "num_tokens": 7577740237.0, + "step": 14824 + }, + { + "epoch": 4.008923742563548, + "grad_norm": 3.109375, + "learning_rate": 0.0037927461765805415, + "loss": 2.893, + "mean_token_accuracy": 0.4297787845134735, + "num_tokens": 7578264483.0, + "step": 14825 + }, + { + "epoch": 4.009194159004868, + "grad_norm": 3.59375, + "learning_rate": 0.0037918020901665577, + "loss": 2.5842, + "mean_token_accuracy": 0.4589381217956543, + "num_tokens": 7578788739.0, + "step": 14826 + }, + { + "epoch": 4.009464575446187, + "grad_norm": 3.40625, + "learning_rate": 0.003790858224921282, + "loss": 2.6196, + "mean_token_accuracy": 0.4355798363685608, + "num_tokens": 7579313019.0, + "step": 14827 + }, + { + "epoch": 4.009734991887507, + "grad_norm": 3.125, + "learning_rate": 0.003789914580873678, + "loss": 2.9434, + "mean_token_accuracy": 0.44182878732681274, + "num_tokens": 7579835132.0, + "step": 14828 + }, + { + "epoch": 4.010005408328826, + "grad_norm": 3.46875, + "learning_rate": 0.0037889711580526963, + "loss": 2.7984, + "mean_token_accuracy": 0.4339786767959595, + "num_tokens": 7580326552.0, + "step": 14829 + }, + { + "epoch": 4.010275824770146, + "grad_norm": 3.75, + "learning_rate": 0.0037880279564872885, + "loss": 2.7126, + "mean_token_accuracy": 0.4468153417110443, + "num_tokens": 7580850622.0, + "step": 14830 + }, + { + "epoch": 4.010546241211466, + "grad_norm": 20.125, + "learning_rate": 0.003787084976206392, + "loss": 9.3058, + "mean_token_accuracy": 0.03538628667593002, + "num_tokens": 7581374888.0, + "step": 14831 + }, + { + "epoch": 4.010816657652786, + "grad_norm": 8.25, + "learning_rate": 0.003786142217238937, + "loss": 2.9858, + "mean_token_accuracy": 0.41803959012031555, + "num_tokens": 7581898996.0, + "step": 14832 + }, + { + "epoch": 4.011087074094105, + "grad_norm": 2.53125, + "learning_rate": 0.0037851996796138565, + "loss": 3.0608, + "mean_token_accuracy": 0.43686643242836, + "num_tokens": 7582362665.0, + "step": 14833 + }, + { + "epoch": 4.011357490535424, + "grad_norm": 3.203125, + "learning_rate": 0.003784257363360064, + "loss": 2.7901, + "mean_token_accuracy": 0.43702322244644165, + "num_tokens": 7582850142.0, + "step": 14834 + }, + { + "epoch": 4.011627906976744, + "grad_norm": 3.234375, + "learning_rate": 0.00378331526850648, + "loss": 2.8559, + "mean_token_accuracy": 0.4255307614803314, + "num_tokens": 7583374299.0, + "step": 14835 + }, + { + "epoch": 4.011898323418063, + "grad_norm": 3.0625, + "learning_rate": 0.0037823733950820017, + "loss": 2.7306, + "mean_token_accuracy": 0.4513099491596222, + "num_tokens": 7583854212.0, + "step": 14836 + }, + { + "epoch": 4.0121687398593835, + "grad_norm": 3.21875, + "learning_rate": 0.003781431743115537, + "loss": 2.9241, + "mean_token_accuracy": 0.4309200644493103, + "num_tokens": 7584378366.0, + "step": 14837 + }, + { + "epoch": 4.012439156300703, + "grad_norm": 2.984375, + "learning_rate": 0.0037804903126359723, + "loss": 2.7106, + "mean_token_accuracy": 0.4278486967086792, + "num_tokens": 7584902651.0, + "step": 14838 + }, + { + "epoch": 4.012709572742023, + "grad_norm": 3.765625, + "learning_rate": 0.0037795491036721985, + "loss": 2.7631, + "mean_token_accuracy": 0.4426266551017761, + "num_tokens": 7585426684.0, + "step": 14839 + }, + { + "epoch": 4.012979989183342, + "grad_norm": 3.3125, + "learning_rate": 0.0037786081162530924, + "loss": 2.7262, + "mean_token_accuracy": 0.47280240058898926, + "num_tokens": 7585950873.0, + "step": 14840 + }, + { + "epoch": 4.013250405624662, + "grad_norm": 3.4375, + "learning_rate": 0.003777667350407523, + "loss": 2.7979, + "mean_token_accuracy": 0.4400542080402374, + "num_tokens": 7586432262.0, + "step": 14841 + }, + { + "epoch": 4.013520822065981, + "grad_norm": 3.71875, + "learning_rate": 0.003776726806164361, + "loss": 2.811, + "mean_token_accuracy": 0.42069464921951294, + "num_tokens": 7586956369.0, + "step": 14842 + }, + { + "epoch": 4.013791238507301, + "grad_norm": 3.15625, + "learning_rate": 0.003775786483552463, + "loss": 2.8779, + "mean_token_accuracy": 0.4470442235469818, + "num_tokens": 7587480639.0, + "step": 14843 + }, + { + "epoch": 4.0140616549486205, + "grad_norm": 3.796875, + "learning_rate": 0.0037748463826006787, + "loss": 2.974, + "mean_token_accuracy": 0.41935935616493225, + "num_tokens": 7588004859.0, + "step": 14844 + }, + { + "epoch": 4.014332071389941, + "grad_norm": 3.59375, + "learning_rate": 0.003773906503337857, + "loss": 3.0076, + "mean_token_accuracy": 0.4526205062866211, + "num_tokens": 7588463818.0, + "step": 14845 + }, + { + "epoch": 4.01460248783126, + "grad_norm": 4.4375, + "learning_rate": 0.003772966845792835, + "loss": 3.1069, + "mean_token_accuracy": 0.42543959617614746, + "num_tokens": 7588900135.0, + "step": 14846 + }, + { + "epoch": 4.01487290427258, + "grad_norm": 4.4375, + "learning_rate": 0.003772027409994442, + "loss": 2.7724, + "mean_token_accuracy": 0.4334099590778351, + "num_tokens": 7589424389.0, + "step": 14847 + }, + { + "epoch": 4.015143320713899, + "grad_norm": 3.328125, + "learning_rate": 0.0037710881959715048, + "loss": 2.9295, + "mean_token_accuracy": 0.4515954852104187, + "num_tokens": 7589948543.0, + "step": 14848 + }, + { + "epoch": 4.015413737155219, + "grad_norm": 4.59375, + "learning_rate": 0.003770149203752838, + "loss": 3.004, + "mean_token_accuracy": 0.43166232109069824, + "num_tokens": 7590472788.0, + "step": 14849 + }, + { + "epoch": 4.015684153596538, + "grad_norm": 3.46875, + "learning_rate": 0.003769210433367256, + "loss": 2.9054, + "mean_token_accuracy": 0.42106759548187256, + "num_tokens": 7590997051.0, + "step": 14850 + }, + { + "epoch": 4.0159545700378585, + "grad_norm": 39.75, + "learning_rate": 0.003768271884843565, + "loss": 9.9969, + "mean_token_accuracy": 0.015858124941587448, + "num_tokens": 7591461996.0, + "step": 14851 + }, + { + "epoch": 4.016224986479178, + "grad_norm": 8.0625, + "learning_rate": 0.003767333558210557, + "loss": 2.9667, + "mean_token_accuracy": 0.4224206805229187, + "num_tokens": 7591986270.0, + "step": 14852 + }, + { + "epoch": 4.016495402920498, + "grad_norm": 2.421875, + "learning_rate": 0.0037663954534970273, + "loss": 2.7745, + "mean_token_accuracy": 0.4425310790538788, + "num_tokens": 7592510498.0, + "step": 14853 + }, + { + "epoch": 4.016765819361817, + "grad_norm": 3.5625, + "learning_rate": 0.0037654575707317586, + "loss": 3.0925, + "mean_token_accuracy": 0.4047682285308838, + "num_tokens": 7593034751.0, + "step": 14854 + }, + { + "epoch": 4.017036235803137, + "grad_norm": 2.734375, + "learning_rate": 0.003764519909943524, + "loss": 2.9783, + "mean_token_accuracy": 0.43607279658317566, + "num_tokens": 7593516981.0, + "step": 14855 + }, + { + "epoch": 4.017306652244456, + "grad_norm": 3.546875, + "learning_rate": 0.003763582471161099, + "loss": 2.7811, + "mean_token_accuracy": 0.4499276280403137, + "num_tokens": 7594036756.0, + "step": 14856 + }, + { + "epoch": 4.017577068685776, + "grad_norm": 3.296875, + "learning_rate": 0.003762645254413245, + "loss": 2.8398, + "mean_token_accuracy": 0.42307668924331665, + "num_tokens": 7594560941.0, + "step": 14857 + }, + { + "epoch": 4.0178474851270956, + "grad_norm": 3.421875, + "learning_rate": 0.0037617082597287154, + "loss": 2.8521, + "mean_token_accuracy": 0.4446612596511841, + "num_tokens": 7595085074.0, + "step": 14858 + }, + { + "epoch": 4.018117901568416, + "grad_norm": 2.765625, + "learning_rate": 0.003760771487136266, + "loss": 2.8081, + "mean_token_accuracy": 0.4362233281135559, + "num_tokens": 7595609347.0, + "step": 14859 + }, + { + "epoch": 4.018388318009735, + "grad_norm": 3.609375, + "learning_rate": 0.0037598349366646366, + "loss": 3.1188, + "mean_token_accuracy": 0.411917120218277, + "num_tokens": 7596133596.0, + "step": 14860 + }, + { + "epoch": 4.018658734451055, + "grad_norm": 3.609375, + "learning_rate": 0.0037588986083425605, + "loss": 2.8971, + "mean_token_accuracy": 0.4318092465400696, + "num_tokens": 7596657708.0, + "step": 14861 + }, + { + "epoch": 4.018929150892374, + "grad_norm": 3.34375, + "learning_rate": 0.0037579625021987723, + "loss": 2.7698, + "mean_token_accuracy": 0.4268577992916107, + "num_tokens": 7597181980.0, + "step": 14862 + }, + { + "epoch": 4.019199567333694, + "grad_norm": 3.125, + "learning_rate": 0.0037570266182619895, + "loss": 2.8333, + "mean_token_accuracy": 0.4504179358482361, + "num_tokens": 7597662735.0, + "step": 14863 + }, + { + "epoch": 4.019469983775013, + "grad_norm": 2.640625, + "learning_rate": 0.0037560909565609304, + "loss": 2.5443, + "mean_token_accuracy": 0.4646497070789337, + "num_tokens": 7598160726.0, + "step": 14864 + }, + { + "epoch": 4.0197404002163335, + "grad_norm": 3.21875, + "learning_rate": 0.0037551555171243052, + "loss": 2.7182, + "mean_token_accuracy": 0.4389584958553314, + "num_tokens": 7598640307.0, + "step": 14865 + }, + { + "epoch": 4.020010816657653, + "grad_norm": 3.015625, + "learning_rate": 0.003754220299980816, + "loss": 2.8043, + "mean_token_accuracy": 0.4598906338214874, + "num_tokens": 7599100687.0, + "step": 14866 + }, + { + "epoch": 4.020281233098973, + "grad_norm": 3.21875, + "learning_rate": 0.0037532853051591526, + "loss": 2.8552, + "mean_token_accuracy": 0.43046626448631287, + "num_tokens": 7599580931.0, + "step": 14867 + }, + { + "epoch": 4.020551649540292, + "grad_norm": 2.890625, + "learning_rate": 0.0037523505326880102, + "loss": 2.6198, + "mean_token_accuracy": 0.4367694854736328, + "num_tokens": 7600105077.0, + "step": 14868 + }, + { + "epoch": 4.020822065981612, + "grad_norm": 3.296875, + "learning_rate": 0.003751415982596068, + "loss": 2.7602, + "mean_token_accuracy": 0.4368782043457031, + "num_tokens": 7600629344.0, + "step": 14869 + }, + { + "epoch": 4.021092482422931, + "grad_norm": 3.625, + "learning_rate": 0.003750481654911997, + "loss": 2.7656, + "mean_token_accuracy": 0.441817969083786, + "num_tokens": 7601142989.0, + "step": 14870 + }, + { + "epoch": 4.021362898864251, + "grad_norm": 32.0, + "learning_rate": 0.0037495475496644704, + "loss": 9.2998, + "mean_token_accuracy": 0.0018544497434049845, + "num_tokens": 7601667090.0, + "step": 14871 + }, + { + "epoch": 4.0216333153055706, + "grad_norm": 7.875, + "learning_rate": 0.003748613666882148, + "loss": 2.8208, + "mean_token_accuracy": 0.4182206988334656, + "num_tokens": 7602191343.0, + "step": 14872 + }, + { + "epoch": 4.021903731746891, + "grad_norm": 2.1875, + "learning_rate": 0.0037476800065936802, + "loss": 2.6692, + "mean_token_accuracy": 0.43751657009124756, + "num_tokens": 7602715545.0, + "step": 14873 + }, + { + "epoch": 4.02217414818821, + "grad_norm": 3.234375, + "learning_rate": 0.00374674656882772, + "loss": 2.9796, + "mean_token_accuracy": 0.43192222714424133, + "num_tokens": 7603239791.0, + "step": 14874 + }, + { + "epoch": 4.022444564629529, + "grad_norm": 2.953125, + "learning_rate": 0.0037458133536129035, + "loss": 2.8307, + "mean_token_accuracy": 0.41685009002685547, + "num_tokens": 7603763961.0, + "step": 14875 + }, + { + "epoch": 4.022714981070849, + "grad_norm": 3.109375, + "learning_rate": 0.003744880360977869, + "loss": 2.5095, + "mean_token_accuracy": 0.4527401328086853, + "num_tokens": 7604288152.0, + "step": 14876 + }, + { + "epoch": 4.022985397512168, + "grad_norm": 2.84375, + "learning_rate": 0.0037439475909512377, + "loss": 2.7329, + "mean_token_accuracy": 0.4295801520347595, + "num_tokens": 7604812398.0, + "step": 14877 + }, + { + "epoch": 4.023255813953488, + "grad_norm": 2.921875, + "learning_rate": 0.0037430150435616358, + "loss": 2.7613, + "mean_token_accuracy": 0.4229980707168579, + "num_tokens": 7605336552.0, + "step": 14878 + }, + { + "epoch": 4.023526230394808, + "grad_norm": 9.875, + "learning_rate": 0.003742082718837672, + "loss": 2.8487, + "mean_token_accuracy": 0.46692389249801636, + "num_tokens": 7605860701.0, + "step": 14879 + }, + { + "epoch": 4.023796646836128, + "grad_norm": 2.9375, + "learning_rate": 0.003741150616807956, + "loss": 2.7418, + "mean_token_accuracy": 0.44007936120033264, + "num_tokens": 7606384813.0, + "step": 14880 + }, + { + "epoch": 4.024067063277447, + "grad_norm": 3.4375, + "learning_rate": 0.0037402187375010832, + "loss": 2.879, + "mean_token_accuracy": 0.427669882774353, + "num_tokens": 7606908975.0, + "step": 14881 + }, + { + "epoch": 4.024337479718767, + "grad_norm": 4.1875, + "learning_rate": 0.0037392870809456525, + "loss": 3.0385, + "mean_token_accuracy": 0.42459630966186523, + "num_tokens": 7607433087.0, + "step": 14882 + }, + { + "epoch": 4.024607896160086, + "grad_norm": 4.875, + "learning_rate": 0.003738355647170247, + "loss": 2.68, + "mean_token_accuracy": 0.4546039402484894, + "num_tokens": 7607947911.0, + "step": 14883 + }, + { + "epoch": 4.024878312601406, + "grad_norm": 2.796875, + "learning_rate": 0.003737424436203442, + "loss": 2.8827, + "mean_token_accuracy": 0.4315738081932068, + "num_tokens": 7608424133.0, + "step": 14884 + }, + { + "epoch": 4.0251487290427255, + "grad_norm": 3.6875, + "learning_rate": 0.0037364934480738155, + "loss": 2.923, + "mean_token_accuracy": 0.4284348487854004, + "num_tokens": 7608925270.0, + "step": 14885 + }, + { + "epoch": 4.025419145484046, + "grad_norm": 3.3125, + "learning_rate": 0.003735562682809931, + "loss": 2.7239, + "mean_token_accuracy": 0.4440329968929291, + "num_tokens": 7609449277.0, + "step": 14886 + }, + { + "epoch": 4.025689561925365, + "grad_norm": 3.53125, + "learning_rate": 0.0037346321404403437, + "loss": 2.8576, + "mean_token_accuracy": 0.40295273065567017, + "num_tokens": 7609973413.0, + "step": 14887 + }, + { + "epoch": 4.025959978366685, + "grad_norm": 3.21875, + "learning_rate": 0.0037337018209936124, + "loss": 2.8712, + "mean_token_accuracy": 0.4201178252696991, + "num_tokens": 7610497670.0, + "step": 14888 + }, + { + "epoch": 4.026230394808004, + "grad_norm": 2.890625, + "learning_rate": 0.0037327717244982762, + "loss": 2.9071, + "mean_token_accuracy": 0.4379546642303467, + "num_tokens": 7611000062.0, + "step": 14889 + }, + { + "epoch": 4.026500811249324, + "grad_norm": 3.359375, + "learning_rate": 0.003731841850982873, + "loss": 2.9143, + "mean_token_accuracy": 0.43721121549606323, + "num_tokens": 7611472899.0, + "step": 14890 + }, + { + "epoch": 4.026771227690643, + "grad_norm": 9.0, + "learning_rate": 0.0037309122004759365, + "loss": 8.6404, + "mean_token_accuracy": 0.03585674241185188, + "num_tokens": 7611997075.0, + "step": 14891 + }, + { + "epoch": 4.027041644131963, + "grad_norm": 6.59375, + "learning_rate": 0.003729982773005993, + "loss": 2.9262, + "mean_token_accuracy": 0.400753378868103, + "num_tokens": 7612521155.0, + "step": 14892 + }, + { + "epoch": 4.027312060573283, + "grad_norm": 2.625, + "learning_rate": 0.003729053568601556, + "loss": 2.9882, + "mean_token_accuracy": 0.4133892059326172, + "num_tokens": 7613045378.0, + "step": 14893 + }, + { + "epoch": 4.027582477014603, + "grad_norm": 4.15625, + "learning_rate": 0.0037281245872911395, + "loss": 3.0843, + "mean_token_accuracy": 0.41736307740211487, + "num_tokens": 7613518948.0, + "step": 14894 + }, + { + "epoch": 4.027852893455922, + "grad_norm": 3.046875, + "learning_rate": 0.003727195829103246, + "loss": 2.8294, + "mean_token_accuracy": 0.4454216957092285, + "num_tokens": 7614032324.0, + "step": 14895 + }, + { + "epoch": 4.028123309897242, + "grad_norm": 3.65625, + "learning_rate": 0.00372626729406637, + "loss": 2.8237, + "mean_token_accuracy": 0.4309190511703491, + "num_tokens": 7614556484.0, + "step": 14896 + }, + { + "epoch": 4.028393726338561, + "grad_norm": 3.3125, + "learning_rate": 0.0037253389822090054, + "loss": 2.6906, + "mean_token_accuracy": 0.43996870517730713, + "num_tokens": 7615080687.0, + "step": 14897 + }, + { + "epoch": 4.028664142779881, + "grad_norm": 3.453125, + "learning_rate": 0.0037244108935596335, + "loss": 2.8241, + "mean_token_accuracy": 0.42699962854385376, + "num_tokens": 7615604864.0, + "step": 14898 + }, + { + "epoch": 4.0289345592212005, + "grad_norm": 3.453125, + "learning_rate": 0.0037234830281467337, + "loss": 2.7024, + "mean_token_accuracy": 0.42348170280456543, + "num_tokens": 7616129117.0, + "step": 14899 + }, + { + "epoch": 4.029204975662521, + "grad_norm": 3.28125, + "learning_rate": 0.0037225553859987727, + "loss": 2.6649, + "mean_token_accuracy": 0.46476152539253235, + "num_tokens": 7616653380.0, + "step": 14900 + }, + { + "epoch": 4.02947539210384, + "grad_norm": 3.375, + "learning_rate": 0.0037216279671442123, + "loss": 2.796, + "mean_token_accuracy": 0.4542435109615326, + "num_tokens": 7617177623.0, + "step": 14901 + }, + { + "epoch": 4.02974580854516, + "grad_norm": 3.234375, + "learning_rate": 0.0037207007716115125, + "loss": 2.7498, + "mean_token_accuracy": 0.438683420419693, + "num_tokens": 7617664894.0, + "step": 14902 + }, + { + "epoch": 4.030016224986479, + "grad_norm": 3.125, + "learning_rate": 0.0037197737994291193, + "loss": 2.8637, + "mean_token_accuracy": 0.43720000982284546, + "num_tokens": 7618189169.0, + "step": 14903 + }, + { + "epoch": 4.030286641427799, + "grad_norm": 3.390625, + "learning_rate": 0.0037188470506254747, + "loss": 2.9271, + "mean_token_accuracy": 0.43301695585250854, + "num_tokens": 7618684988.0, + "step": 14904 + }, + { + "epoch": 4.030557057869118, + "grad_norm": 3.796875, + "learning_rate": 0.003717920525229016, + "loss": 2.794, + "mean_token_accuracy": 0.43765512108802795, + "num_tokens": 7619154629.0, + "step": 14905 + }, + { + "epoch": 4.030827474310438, + "grad_norm": 4.0625, + "learning_rate": 0.0037169942232681687, + "loss": 2.8681, + "mean_token_accuracy": 0.4409828186035156, + "num_tokens": 7619678814.0, + "step": 14906 + }, + { + "epoch": 4.031097890751758, + "grad_norm": 4.1875, + "learning_rate": 0.0037160681447713567, + "loss": 2.7702, + "mean_token_accuracy": 0.4359978437423706, + "num_tokens": 7620203069.0, + "step": 14907 + }, + { + "epoch": 4.031368307193078, + "grad_norm": 3.890625, + "learning_rate": 0.003715142289766997, + "loss": 2.4993, + "mean_token_accuracy": 0.4885002374649048, + "num_tokens": 7620727242.0, + "step": 14908 + }, + { + "epoch": 4.031638723634397, + "grad_norm": 3.65625, + "learning_rate": 0.0037142166582834956, + "loss": 3.0494, + "mean_token_accuracy": 0.4135349988937378, + "num_tokens": 7621251264.0, + "step": 14909 + }, + { + "epoch": 4.031909140075717, + "grad_norm": 4.21875, + "learning_rate": 0.0037132912503492505, + "loss": 2.9289, + "mean_token_accuracy": 0.4126613140106201, + "num_tokens": 7621775343.0, + "step": 14910 + }, + { + "epoch": 4.032179556517036, + "grad_norm": 302.0, + "learning_rate": 0.003712366065992662, + "loss": 13.0851, + "mean_token_accuracy": 0.018156811594963074, + "num_tokens": 7622275427.0, + "step": 14911 + }, + { + "epoch": 4.032449972958356, + "grad_norm": 7.0, + "learning_rate": 0.003711441105242112, + "loss": 2.3354, + "mean_token_accuracy": 0.5136414170265198, + "num_tokens": 7622799655.0, + "step": 14912 + }, + { + "epoch": 4.0327203893996755, + "grad_norm": 3.15625, + "learning_rate": 0.0037105163681259825, + "loss": 2.953, + "mean_token_accuracy": 0.42365655303001404, + "num_tokens": 7623323831.0, + "step": 14913 + }, + { + "epoch": 4.032990805840996, + "grad_norm": 2.75, + "learning_rate": 0.0037095918546726494, + "loss": 2.9574, + "mean_token_accuracy": 0.42660078406333923, + "num_tokens": 7623848113.0, + "step": 14914 + }, + { + "epoch": 4.033261222282315, + "grad_norm": 3.25, + "learning_rate": 0.0037086675649104783, + "loss": 2.7948, + "mean_token_accuracy": 0.4288884401321411, + "num_tokens": 7624372307.0, + "step": 14915 + }, + { + "epoch": 4.033531638723634, + "grad_norm": 3.203125, + "learning_rate": 0.003707743498867826, + "loss": 2.7338, + "mean_token_accuracy": 0.44142642617225647, + "num_tokens": 7624882415.0, + "step": 14916 + }, + { + "epoch": 4.033802055164954, + "grad_norm": 3.125, + "learning_rate": 0.0037068196565730512, + "loss": 2.8696, + "mean_token_accuracy": 0.43767696619033813, + "num_tokens": 7625402427.0, + "step": 14917 + }, + { + "epoch": 4.034072471606273, + "grad_norm": 3.265625, + "learning_rate": 0.0037058960380544972, + "loss": 2.874, + "mean_token_accuracy": 0.43982046842575073, + "num_tokens": 7625865823.0, + "step": 14918 + }, + { + "epoch": 4.034342888047593, + "grad_norm": 35.0, + "learning_rate": 0.0037049726433405006, + "loss": 2.927, + "mean_token_accuracy": 0.43929654359817505, + "num_tokens": 7626389827.0, + "step": 14919 + }, + { + "epoch": 4.0346133044889125, + "grad_norm": 6.09375, + "learning_rate": 0.0037040494724593963, + "loss": 2.9325, + "mean_token_accuracy": 0.4305141568183899, + "num_tokens": 7626913984.0, + "step": 14920 + }, + { + "epoch": 4.034883720930233, + "grad_norm": 2.8125, + "learning_rate": 0.003703126525439513, + "loss": 2.9904, + "mean_token_accuracy": 0.4119304120540619, + "num_tokens": 7627438164.0, + "step": 14921 + }, + { + "epoch": 4.035154137371552, + "grad_norm": 3.359375, + "learning_rate": 0.0037022038023091642, + "loss": 2.77, + "mean_token_accuracy": 0.4182432293891907, + "num_tokens": 7627962413.0, + "step": 14922 + }, + { + "epoch": 4.035424553812872, + "grad_norm": 2.9375, + "learning_rate": 0.0037012813030966665, + "loss": 2.8461, + "mean_token_accuracy": 0.430863618850708, + "num_tokens": 7628486600.0, + "step": 14923 + }, + { + "epoch": 4.035694970254191, + "grad_norm": 3.8125, + "learning_rate": 0.00370035902783032, + "loss": 2.7049, + "mean_token_accuracy": 0.44317349791526794, + "num_tokens": 7629010815.0, + "step": 14924 + }, + { + "epoch": 4.035965386695511, + "grad_norm": 3.65625, + "learning_rate": 0.0036994369765384273, + "loss": 2.8315, + "mean_token_accuracy": 0.43655896186828613, + "num_tokens": 7629506416.0, + "step": 14925 + }, + { + "epoch": 4.03623580313683, + "grad_norm": 4.53125, + "learning_rate": 0.0036985151492492786, + "loss": 2.8914, + "mean_token_accuracy": 0.4439058005809784, + "num_tokens": 7630030649.0, + "step": 14926 + }, + { + "epoch": 4.0365062195781505, + "grad_norm": 4.375, + "learning_rate": 0.0036975935459911545, + "loss": 3.0043, + "mean_token_accuracy": 0.4151840806007385, + "num_tokens": 7630520144.0, + "step": 14927 + }, + { + "epoch": 4.03677663601947, + "grad_norm": 3.828125, + "learning_rate": 0.0036966721667923376, + "loss": 2.796, + "mean_token_accuracy": 0.4369235336780548, + "num_tokens": 7630997653.0, + "step": 14928 + }, + { + "epoch": 4.03704705246079, + "grad_norm": 4.21875, + "learning_rate": 0.0036957510116810972, + "loss": 2.808, + "mean_token_accuracy": 0.4526623785495758, + "num_tokens": 7631521890.0, + "step": 14929 + }, + { + "epoch": 4.037317468902109, + "grad_norm": 3.9375, + "learning_rate": 0.003694830080685694, + "loss": 2.9424, + "mean_token_accuracy": 0.4440075159072876, + "num_tokens": 7632015853.0, + "step": 14930 + }, + { + "epoch": 4.037587885343429, + "grad_norm": 90.5, + "learning_rate": 0.0036939093738343875, + "loss": 8.336, + "mean_token_accuracy": 0.0662432536482811, + "num_tokens": 7632540099.0, + "step": 14931 + }, + { + "epoch": 4.037858301784748, + "grad_norm": 8.1875, + "learning_rate": 0.003692988891155429, + "loss": 3.0388, + "mean_token_accuracy": 0.4252561926841736, + "num_tokens": 7633064299.0, + "step": 14932 + }, + { + "epoch": 4.038128718226068, + "grad_norm": 2.4375, + "learning_rate": 0.0036920686326770563, + "loss": 2.9378, + "mean_token_accuracy": 0.41669583320617676, + "num_tokens": 7633588425.0, + "step": 14933 + }, + { + "epoch": 4.0383991346673875, + "grad_norm": 3.4375, + "learning_rate": 0.0036911485984275102, + "loss": 2.7955, + "mean_token_accuracy": 0.4381205141544342, + "num_tokens": 7634069496.0, + "step": 14934 + }, + { + "epoch": 4.038669551108708, + "grad_norm": 3.828125, + "learning_rate": 0.0036902287884350207, + "loss": 2.7196, + "mean_token_accuracy": 0.42047011852264404, + "num_tokens": 7634593622.0, + "step": 14935 + }, + { + "epoch": 4.038939967550027, + "grad_norm": 3.9375, + "learning_rate": 0.0036893092027278068, + "loss": 2.5814, + "mean_token_accuracy": 0.4505772292613983, + "num_tokens": 7635117751.0, + "step": 14936 + }, + { + "epoch": 4.039210383991347, + "grad_norm": 3.21875, + "learning_rate": 0.0036883898413340877, + "loss": 2.8005, + "mean_token_accuracy": 0.4345087707042694, + "num_tokens": 7635641972.0, + "step": 14937 + }, + { + "epoch": 4.039480800432666, + "grad_norm": 3.109375, + "learning_rate": 0.003687470704282071, + "loss": 2.9104, + "mean_token_accuracy": 0.42488473653793335, + "num_tokens": 7636166202.0, + "step": 14938 + }, + { + "epoch": 4.039751216873986, + "grad_norm": 3.890625, + "learning_rate": 0.0036865517915999546, + "loss": 2.8994, + "mean_token_accuracy": 0.42472952604293823, + "num_tokens": 7636690423.0, + "step": 14939 + }, + { + "epoch": 4.040021633315305, + "grad_norm": 2.890625, + "learning_rate": 0.00368563310331594, + "loss": 2.5999, + "mean_token_accuracy": 0.47261449694633484, + "num_tokens": 7637151504.0, + "step": 14940 + }, + { + "epoch": 4.0402920497566255, + "grad_norm": 3.375, + "learning_rate": 0.0036847146394582107, + "loss": 2.6948, + "mean_token_accuracy": 0.4306185841560364, + "num_tokens": 7637675645.0, + "step": 14941 + }, + { + "epoch": 4.040562466197945, + "grad_norm": 2.796875, + "learning_rate": 0.003683796400054948, + "loss": 2.8568, + "mean_token_accuracy": 0.43109288811683655, + "num_tokens": 7638199912.0, + "step": 14942 + }, + { + "epoch": 4.040832882639265, + "grad_norm": 3.234375, + "learning_rate": 0.003682878385134328, + "loss": 2.7541, + "mean_token_accuracy": 0.4417250454425812, + "num_tokens": 7638724152.0, + "step": 14943 + }, + { + "epoch": 4.041103299080584, + "grad_norm": 2.890625, + "learning_rate": 0.0036819605947245168, + "loss": 2.8717, + "mean_token_accuracy": 0.41071122884750366, + "num_tokens": 7639248270.0, + "step": 14944 + }, + { + "epoch": 4.041373715521904, + "grad_norm": 3.984375, + "learning_rate": 0.0036810430288536766, + "loss": 2.8685, + "mean_token_accuracy": 0.41769295930862427, + "num_tokens": 7639772439.0, + "step": 14945 + }, + { + "epoch": 4.041644131963223, + "grad_norm": 3.875, + "learning_rate": 0.003680125687549961, + "loss": 2.6949, + "mean_token_accuracy": 0.4523520767688751, + "num_tokens": 7640236784.0, + "step": 14946 + }, + { + "epoch": 4.041914548404543, + "grad_norm": 3.3125, + "learning_rate": 0.0036792085708415136, + "loss": 2.7838, + "mean_token_accuracy": 0.4288772940635681, + "num_tokens": 7640737804.0, + "step": 14947 + }, + { + "epoch": 4.0421849648458625, + "grad_norm": 3.421875, + "learning_rate": 0.003678291678756478, + "loss": 2.9669, + "mean_token_accuracy": 0.4357127547264099, + "num_tokens": 7641226424.0, + "step": 14948 + }, + { + "epoch": 4.042455381287183, + "grad_norm": 3.828125, + "learning_rate": 0.003677375011322982, + "loss": 2.871, + "mean_token_accuracy": 0.4298439025878906, + "num_tokens": 7641750638.0, + "step": 14949 + }, + { + "epoch": 4.042725797728502, + "grad_norm": 2.4375, + "learning_rate": 0.003676458568569156, + "loss": 2.7289, + "mean_token_accuracy": 0.45180970430374146, + "num_tokens": 7642274742.0, + "step": 14950 + }, + { + "epoch": 4.042996214169822, + "grad_norm": 109.0, + "learning_rate": 0.0036755423505231213, + "loss": 11.3491, + "mean_token_accuracy": 5.220883758738637e-05, + "num_tokens": 7642798930.0, + "step": 14951 + }, + { + "epoch": 4.043266630611141, + "grad_norm": 6.125, + "learning_rate": 0.0036746263572129856, + "loss": 2.9008, + "mean_token_accuracy": 0.39955851435661316, + "num_tokens": 7643323194.0, + "step": 14952 + }, + { + "epoch": 4.043537047052461, + "grad_norm": 2.484375, + "learning_rate": 0.0036737105886668554, + "loss": 2.865, + "mean_token_accuracy": 0.43028056621551514, + "num_tokens": 7643847432.0, + "step": 14953 + }, + { + "epoch": 4.04380746349378, + "grad_norm": 3.8125, + "learning_rate": 0.00367279504491283, + "loss": 2.9744, + "mean_token_accuracy": 0.4322505593299866, + "num_tokens": 7644371711.0, + "step": 14954 + }, + { + "epoch": 4.0440778799351005, + "grad_norm": 3.84375, + "learning_rate": 0.0036718797259790016, + "loss": 2.7761, + "mean_token_accuracy": 0.442981481552124, + "num_tokens": 7644895941.0, + "step": 14955 + }, + { + "epoch": 4.04434829637642, + "grad_norm": 2.90625, + "learning_rate": 0.003670964631893451, + "loss": 2.7894, + "mean_token_accuracy": 0.4253121018409729, + "num_tokens": 7645420140.0, + "step": 14956 + }, + { + "epoch": 4.044618712817739, + "grad_norm": 4.25, + "learning_rate": 0.003670049762684262, + "loss": 2.5357, + "mean_token_accuracy": 0.45421910285949707, + "num_tokens": 7645892986.0, + "step": 14957 + }, + { + "epoch": 4.044889129259059, + "grad_norm": 3.046875, + "learning_rate": 0.0036691351183795007, + "loss": 2.9894, + "mean_token_accuracy": 0.41562142968177795, + "num_tokens": 7646417129.0, + "step": 14958 + }, + { + "epoch": 4.045159545700378, + "grad_norm": 3.25, + "learning_rate": 0.003668220699007231, + "loss": 2.8992, + "mean_token_accuracy": 0.4282693862915039, + "num_tokens": 7646941300.0, + "step": 14959 + }, + { + "epoch": 4.045429962141698, + "grad_norm": 3.3125, + "learning_rate": 0.003667306504595515, + "loss": 2.9292, + "mean_token_accuracy": 0.43086132407188416, + "num_tokens": 7647465575.0, + "step": 14960 + }, + { + "epoch": 4.0457003785830175, + "grad_norm": 3.078125, + "learning_rate": 0.003666392535172396, + "loss": 2.9269, + "mean_token_accuracy": 0.4398108720779419, + "num_tokens": 7647989786.0, + "step": 14961 + }, + { + "epoch": 4.0459707950243375, + "grad_norm": 3.953125, + "learning_rate": 0.0036654787907659226, + "loss": 2.8044, + "mean_token_accuracy": 0.42077234387397766, + "num_tokens": 7648490190.0, + "step": 14962 + }, + { + "epoch": 4.046241211465657, + "grad_norm": 3.515625, + "learning_rate": 0.0036645652714041277, + "loss": 3.0247, + "mean_token_accuracy": 0.42099982500076294, + "num_tokens": 7649014367.0, + "step": 14963 + }, + { + "epoch": 4.046511627906977, + "grad_norm": 3.609375, + "learning_rate": 0.0036636519771150455, + "loss": 2.9304, + "mean_token_accuracy": 0.4314744472503662, + "num_tokens": 7649538449.0, + "step": 14964 + }, + { + "epoch": 4.046782044348296, + "grad_norm": 3.46875, + "learning_rate": 0.003662738907926692, + "loss": 2.7051, + "mean_token_accuracy": 0.45796310901641846, + "num_tokens": 7649964186.0, + "step": 14965 + }, + { + "epoch": 4.047052460789616, + "grad_norm": 2.75, + "learning_rate": 0.0036618260638670895, + "loss": 2.8591, + "mean_token_accuracy": 0.4465416669845581, + "num_tokens": 7650488337.0, + "step": 14966 + }, + { + "epoch": 4.047322877230935, + "grad_norm": 2.953125, + "learning_rate": 0.0036609134449642412, + "loss": 2.7963, + "mean_token_accuracy": 0.4549742639064789, + "num_tokens": 7650913497.0, + "step": 14967 + }, + { + "epoch": 4.047593293672255, + "grad_norm": 2.890625, + "learning_rate": 0.0036600010512461523, + "loss": 2.7229, + "mean_token_accuracy": 0.43991631269454956, + "num_tokens": 7651437530.0, + "step": 14968 + }, + { + "epoch": 4.047863710113575, + "grad_norm": 2.9375, + "learning_rate": 0.0036590888827408175, + "loss": 2.9123, + "mean_token_accuracy": 0.4418795704841614, + "num_tokens": 7651961786.0, + "step": 14969 + }, + { + "epoch": 4.048134126554895, + "grad_norm": 4.4375, + "learning_rate": 0.0036581769394762216, + "loss": 2.8181, + "mean_token_accuracy": 0.42125093936920166, + "num_tokens": 7652485990.0, + "step": 14970 + }, + { + "epoch": 4.048404542996214, + "grad_norm": 121.5, + "learning_rate": 0.0036572652214803504, + "loss": 11.4119, + "mean_token_accuracy": 0.0036159800365567207, + "num_tokens": 7653010215.0, + "step": 14971 + }, + { + "epoch": 4.048674959437534, + "grad_norm": 6.75, + "learning_rate": 0.0036563537287811754, + "loss": 2.7609, + "mean_token_accuracy": 0.43681269884109497, + "num_tokens": 7653529954.0, + "step": 14972 + }, + { + "epoch": 4.048945375878853, + "grad_norm": 2.875, + "learning_rate": 0.003655442461406663, + "loss": 2.9542, + "mean_token_accuracy": 0.44834399223327637, + "num_tokens": 7653991610.0, + "step": 14973 + }, + { + "epoch": 4.049215792320173, + "grad_norm": 3.265625, + "learning_rate": 0.0036545314193847755, + "loss": 2.8672, + "mean_token_accuracy": 0.4218888580799103, + "num_tokens": 7654483939.0, + "step": 14974 + }, + { + "epoch": 4.0494862087614925, + "grad_norm": 3.65625, + "learning_rate": 0.003653620602743467, + "loss": 3.0604, + "mean_token_accuracy": 0.420192152261734, + "num_tokens": 7654970031.0, + "step": 14975 + }, + { + "epoch": 4.0497566252028125, + "grad_norm": 3.34375, + "learning_rate": 0.0036527100115106804, + "loss": 2.6924, + "mean_token_accuracy": 0.44779688119888306, + "num_tokens": 7655467107.0, + "step": 14976 + }, + { + "epoch": 4.050027041644132, + "grad_norm": 9.1875, + "learning_rate": 0.0036517996457143577, + "loss": 2.7809, + "mean_token_accuracy": 0.4408753514289856, + "num_tokens": 7655991215.0, + "step": 14977 + }, + { + "epoch": 4.050297458085452, + "grad_norm": 2.453125, + "learning_rate": 0.003650889505382432, + "loss": 2.8242, + "mean_token_accuracy": 0.43939441442489624, + "num_tokens": 7656515246.0, + "step": 14978 + }, + { + "epoch": 4.050567874526771, + "grad_norm": 3.578125, + "learning_rate": 0.003649979590542828, + "loss": 2.9286, + "mean_token_accuracy": 0.42759978771209717, + "num_tokens": 7656986961.0, + "step": 14979 + }, + { + "epoch": 4.050838290968091, + "grad_norm": 3.671875, + "learning_rate": 0.0036490699012234674, + "loss": 2.8138, + "mean_token_accuracy": 0.44683951139450073, + "num_tokens": 7657511180.0, + "step": 14980 + }, + { + "epoch": 4.05110870740941, + "grad_norm": 3.71875, + "learning_rate": 0.00364816043745226, + "loss": 2.7762, + "mean_token_accuracy": 0.44320762157440186, + "num_tokens": 7658035406.0, + "step": 14981 + }, + { + "epoch": 4.05137912385073, + "grad_norm": 3.453125, + "learning_rate": 0.003647251199257108, + "loss": 2.9575, + "mean_token_accuracy": 0.43062081933021545, + "num_tokens": 7658559670.0, + "step": 14982 + }, + { + "epoch": 4.05164954029205, + "grad_norm": 4.25, + "learning_rate": 0.003646342186665915, + "loss": 2.6843, + "mean_token_accuracy": 0.45621350407600403, + "num_tokens": 7659083917.0, + "step": 14983 + }, + { + "epoch": 4.05191995673337, + "grad_norm": 3.375, + "learning_rate": 0.0036454333997065686, + "loss": 2.8168, + "mean_token_accuracy": 0.4153732657432556, + "num_tokens": 7659608043.0, + "step": 14984 + }, + { + "epoch": 4.052190373174689, + "grad_norm": 3.328125, + "learning_rate": 0.0036445248384069516, + "loss": 2.9033, + "mean_token_accuracy": 0.43862640857696533, + "num_tokens": 7660132208.0, + "step": 14985 + }, + { + "epoch": 4.052460789616009, + "grad_norm": 4.0625, + "learning_rate": 0.003643616502794947, + "loss": 2.686, + "mean_token_accuracy": 0.4746452569961548, + "num_tokens": 7660656391.0, + "step": 14986 + }, + { + "epoch": 4.052731206057328, + "grad_norm": 2.421875, + "learning_rate": 0.0036427083928984205, + "loss": 2.9773, + "mean_token_accuracy": 0.4378439784049988, + "num_tokens": 7661140922.0, + "step": 14987 + }, + { + "epoch": 4.053001622498648, + "grad_norm": 3.109375, + "learning_rate": 0.0036418005087452353, + "loss": 2.7636, + "mean_token_accuracy": 0.44270145893096924, + "num_tokens": 7661665002.0, + "step": 14988 + }, + { + "epoch": 4.0532720389399675, + "grad_norm": 4.375, + "learning_rate": 0.003640892850363252, + "loss": 2.7101, + "mean_token_accuracy": 0.45858234167099, + "num_tokens": 7662189238.0, + "step": 14989 + }, + { + "epoch": 4.0535424553812875, + "grad_norm": 2.375, + "learning_rate": 0.0036399854177803148, + "loss": 2.762, + "mean_token_accuracy": 0.4494816064834595, + "num_tokens": 7662713454.0, + "step": 14990 + }, + { + "epoch": 4.053812871822607, + "grad_norm": 24.0, + "learning_rate": 0.0036390782110242727, + "loss": 10.8038, + "mean_token_accuracy": 1.653604158491362e-05, + "num_tokens": 7663237648.0, + "step": 14991 + }, + { + "epoch": 4.054083288263927, + "grad_norm": 6.59375, + "learning_rate": 0.0036381712301229554, + "loss": 2.7905, + "mean_token_accuracy": 0.44626864790916443, + "num_tokens": 7663749115.0, + "step": 14992 + }, + { + "epoch": 4.054353704705246, + "grad_norm": 2.53125, + "learning_rate": 0.0036372644751041937, + "loss": 3.0304, + "mean_token_accuracy": 0.4234502911567688, + "num_tokens": 7664273335.0, + "step": 14993 + }, + { + "epoch": 4.054624121146566, + "grad_norm": 3.265625, + "learning_rate": 0.0036363579459958127, + "loss": 2.9071, + "mean_token_accuracy": 0.42610734701156616, + "num_tokens": 7664797607.0, + "step": 14994 + }, + { + "epoch": 4.054894537587885, + "grad_norm": 4.71875, + "learning_rate": 0.003635451642825627, + "loss": 2.6447, + "mean_token_accuracy": 0.47028839588165283, + "num_tokens": 7665321823.0, + "step": 14995 + }, + { + "epoch": 4.055164954029205, + "grad_norm": 3.578125, + "learning_rate": 0.0036345455656214376, + "loss": 3.0111, + "mean_token_accuracy": 0.4190894365310669, + "num_tokens": 7665846102.0, + "step": 14996 + }, + { + "epoch": 4.055435370470525, + "grad_norm": 3.46875, + "learning_rate": 0.0036336397144110543, + "loss": 2.7822, + "mean_token_accuracy": 0.4338136315345764, + "num_tokens": 7666370362.0, + "step": 14997 + }, + { + "epoch": 4.055705786911844, + "grad_norm": 3.296875, + "learning_rate": 0.0036327340892222683, + "loss": 2.6519, + "mean_token_accuracy": 0.4433462917804718, + "num_tokens": 7666894548.0, + "step": 14998 + }, + { + "epoch": 4.055976203353164, + "grad_norm": 2.734375, + "learning_rate": 0.0036318286900828625, + "loss": 2.7916, + "mean_token_accuracy": 0.4450022280216217, + "num_tokens": 7667418754.0, + "step": 14999 + }, + { + "epoch": 4.056246619794483, + "grad_norm": 3.5625, + "learning_rate": 0.003630923517020624, + "loss": 2.8195, + "mean_token_accuracy": 0.4289833903312683, + "num_tokens": 7667942853.0, + "step": 15000 + }, + { + "epoch": 4.056517036235803, + "grad_norm": 4.0625, + "learning_rate": 0.0036300185700633225, + "loss": 2.9092, + "mean_token_accuracy": 0.3852638900279999, + "num_tokens": 7668467132.0, + "step": 15001 + }, + { + "epoch": 4.056787452677122, + "grad_norm": 3.5625, + "learning_rate": 0.0036291138492387234, + "loss": 2.8339, + "mean_token_accuracy": 0.43240582942962646, + "num_tokens": 7668991317.0, + "step": 15002 + }, + { + "epoch": 4.0570578691184425, + "grad_norm": 3.296875, + "learning_rate": 0.0036282093545745887, + "loss": 2.6855, + "mean_token_accuracy": 0.4480079412460327, + "num_tokens": 7669492686.0, + "step": 15003 + }, + { + "epoch": 4.057328285559762, + "grad_norm": 3.859375, + "learning_rate": 0.003627305086098669, + "loss": 2.996, + "mean_token_accuracy": 0.42286795377731323, + "num_tokens": 7670012917.0, + "step": 15004 + }, + { + "epoch": 4.057598702001082, + "grad_norm": 3.09375, + "learning_rate": 0.003626401043838713, + "loss": 2.76, + "mean_token_accuracy": 0.42820310592651367, + "num_tokens": 7670536946.0, + "step": 15005 + }, + { + "epoch": 4.057869118442401, + "grad_norm": 3.265625, + "learning_rate": 0.003625497227822454, + "loss": 2.8879, + "mean_token_accuracy": 0.45124202966690063, + "num_tokens": 7671061042.0, + "step": 15006 + }, + { + "epoch": 4.058139534883721, + "grad_norm": 3.375, + "learning_rate": 0.0036245936380776313, + "loss": 2.7733, + "mean_token_accuracy": 0.4602791666984558, + "num_tokens": 7671517137.0, + "step": 15007 + }, + { + "epoch": 4.05840995132504, + "grad_norm": 3.203125, + "learning_rate": 0.003623690274631961, + "loss": 2.8362, + "mean_token_accuracy": 0.43305397033691406, + "num_tokens": 7672041308.0, + "step": 15008 + }, + { + "epoch": 4.05868036776636, + "grad_norm": 3.046875, + "learning_rate": 0.0036227871375131694, + "loss": 2.6468, + "mean_token_accuracy": 0.45448872447013855, + "num_tokens": 7672490628.0, + "step": 15009 + }, + { + "epoch": 4.0589507842076795, + "grad_norm": 4.1875, + "learning_rate": 0.0036218842267489626, + "loss": 2.7614, + "mean_token_accuracy": 0.456307053565979, + "num_tokens": 7672989909.0, + "step": 15010 + }, + { + "epoch": 4.059221200649, + "grad_norm": 41.25, + "learning_rate": 0.0036209815423670433, + "loss": 11.2604, + "mean_token_accuracy": 0.021490905433893204, + "num_tokens": 7673514153.0, + "step": 15011 + }, + { + "epoch": 4.059491617090319, + "grad_norm": 5.4375, + "learning_rate": 0.0036200790843951124, + "loss": 2.8989, + "mean_token_accuracy": 0.4282626509666443, + "num_tokens": 7674038438.0, + "step": 15012 + }, + { + "epoch": 4.059762033531639, + "grad_norm": 2.875, + "learning_rate": 0.003619176852860856, + "loss": 2.8809, + "mean_token_accuracy": 0.4230807423591614, + "num_tokens": 7674562582.0, + "step": 15013 + }, + { + "epoch": 4.060032449972958, + "grad_norm": 4.09375, + "learning_rate": 0.0036182748477919624, + "loss": 2.8925, + "mean_token_accuracy": 0.44532841444015503, + "num_tokens": 7675086731.0, + "step": 15014 + }, + { + "epoch": 4.060302866414278, + "grad_norm": 4.09375, + "learning_rate": 0.0036173730692161035, + "loss": 2.9579, + "mean_token_accuracy": 0.40349408984184265, + "num_tokens": 7675610978.0, + "step": 15015 + }, + { + "epoch": 4.060573282855597, + "grad_norm": 2.515625, + "learning_rate": 0.0036164715171609486, + "loss": 2.8214, + "mean_token_accuracy": 0.44028961658477783, + "num_tokens": 7676135260.0, + "step": 15016 + }, + { + "epoch": 4.0608436992969175, + "grad_norm": 3.25, + "learning_rate": 0.0036155701916541624, + "loss": 2.7961, + "mean_token_accuracy": 0.45153874158859253, + "num_tokens": 7676555588.0, + "step": 15017 + }, + { + "epoch": 4.061114115738237, + "grad_norm": 2.890625, + "learning_rate": 0.0036146690927233997, + "loss": 2.6274, + "mean_token_accuracy": 0.46848392486572266, + "num_tokens": 7677059930.0, + "step": 15018 + }, + { + "epoch": 4.061384532179557, + "grad_norm": 3.03125, + "learning_rate": 0.003613768220396305, + "loss": 2.6363, + "mean_token_accuracy": 0.4330577254295349, + "num_tokens": 7677584061.0, + "step": 15019 + }, + { + "epoch": 4.061654948620876, + "grad_norm": 3.359375, + "learning_rate": 0.0036128675747005234, + "loss": 2.841, + "mean_token_accuracy": 0.44001010060310364, + "num_tokens": 7678095622.0, + "step": 15020 + }, + { + "epoch": 4.061925365062196, + "grad_norm": 3.140625, + "learning_rate": 0.00361196715566369, + "loss": 2.898, + "mean_token_accuracy": 0.42457669973373413, + "num_tokens": 7678619811.0, + "step": 15021 + }, + { + "epoch": 4.062195781503515, + "grad_norm": 3.375, + "learning_rate": 0.0036110669633134308, + "loss": 3.0015, + "mean_token_accuracy": 0.42675337195396423, + "num_tokens": 7679144089.0, + "step": 15022 + }, + { + "epoch": 4.062466197944835, + "grad_norm": 4.15625, + "learning_rate": 0.0036101669976773675, + "loss": 2.6939, + "mean_token_accuracy": 0.4325074553489685, + "num_tokens": 7679668171.0, + "step": 15023 + }, + { + "epoch": 4.0627366143861545, + "grad_norm": 2.46875, + "learning_rate": 0.003609267258783113, + "loss": 2.8919, + "mean_token_accuracy": 0.43892958760261536, + "num_tokens": 7680192416.0, + "step": 15024 + }, + { + "epoch": 4.063007030827475, + "grad_norm": 3.71875, + "learning_rate": 0.003608367746658271, + "loss": 2.7252, + "mean_token_accuracy": 0.49006277322769165, + "num_tokens": 7680578378.0, + "step": 15025 + }, + { + "epoch": 4.063277447268794, + "grad_norm": 3.515625, + "learning_rate": 0.0036074684613304465, + "loss": 2.8532, + "mean_token_accuracy": 0.43020594120025635, + "num_tokens": 7681102497.0, + "step": 15026 + }, + { + "epoch": 4.063547863710114, + "grad_norm": 4.34375, + "learning_rate": 0.00360656940282723, + "loss": 2.7349, + "mean_token_accuracy": 0.46343502402305603, + "num_tokens": 7681590736.0, + "step": 15027 + }, + { + "epoch": 4.063818280151433, + "grad_norm": 3.6875, + "learning_rate": 0.003605670571176205, + "loss": 2.825, + "mean_token_accuracy": 0.42814111709594727, + "num_tokens": 7682114967.0, + "step": 15028 + }, + { + "epoch": 4.064088696592753, + "grad_norm": 3.109375, + "learning_rate": 0.003604771966404954, + "loss": 2.8444, + "mean_token_accuracy": 0.4389001727104187, + "num_tokens": 7682639195.0, + "step": 15029 + }, + { + "epoch": 4.064359113034072, + "grad_norm": 3.71875, + "learning_rate": 0.0036038735885410463, + "loss": 2.7765, + "mean_token_accuracy": 0.42775774002075195, + "num_tokens": 7683163364.0, + "step": 15030 + }, + { + "epoch": 4.0646295294753925, + "grad_norm": 59.25, + "learning_rate": 0.003602975437612046, + "loss": 11.1945, + "mean_token_accuracy": 0.0011185023467987776, + "num_tokens": 7683687494.0, + "step": 15031 + }, + { + "epoch": 4.064899945916712, + "grad_norm": 5.3125, + "learning_rate": 0.0036020775136455154, + "loss": 2.9288, + "mean_token_accuracy": 0.4412970542907715, + "num_tokens": 7684210976.0, + "step": 15032 + }, + { + "epoch": 4.065170362358032, + "grad_norm": 2.328125, + "learning_rate": 0.0036011798166689993, + "loss": 2.7078, + "mean_token_accuracy": 0.4726096987724304, + "num_tokens": 7684735201.0, + "step": 15033 + }, + { + "epoch": 4.065440778799351, + "grad_norm": 3.8125, + "learning_rate": 0.003600282346710044, + "loss": 2.8918, + "mean_token_accuracy": 0.42666539549827576, + "num_tokens": 7685259305.0, + "step": 15034 + }, + { + "epoch": 4.065711195240671, + "grad_norm": 3.40625, + "learning_rate": 0.003599385103796191, + "loss": 2.9515, + "mean_token_accuracy": 0.414188027381897, + "num_tokens": 7685783575.0, + "step": 15035 + }, + { + "epoch": 4.06598161168199, + "grad_norm": 3.46875, + "learning_rate": 0.003598488087954964, + "loss": 3.0494, + "mean_token_accuracy": 0.4222838878631592, + "num_tokens": 7686270814.0, + "step": 15036 + }, + { + "epoch": 4.06625202812331, + "grad_norm": 4.375, + "learning_rate": 0.0035975912992138926, + "loss": 2.8257, + "mean_token_accuracy": 0.4653632640838623, + "num_tokens": 7686731394.0, + "step": 15037 + }, + { + "epoch": 4.0665224445646295, + "grad_norm": 3.03125, + "learning_rate": 0.0035966947376004877, + "loss": 2.7849, + "mean_token_accuracy": 0.440654456615448, + "num_tokens": 7687255666.0, + "step": 15038 + }, + { + "epoch": 4.066792861005949, + "grad_norm": 3.09375, + "learning_rate": 0.003595798403142258, + "loss": 2.7768, + "mean_token_accuracy": 0.44669005274772644, + "num_tokens": 7687779879.0, + "step": 15039 + }, + { + "epoch": 4.067063277447269, + "grad_norm": 3.078125, + "learning_rate": 0.00359490229586671, + "loss": 2.6874, + "mean_token_accuracy": 0.42691636085510254, + "num_tokens": 7688299930.0, + "step": 15040 + }, + { + "epoch": 4.067333693888588, + "grad_norm": 3.203125, + "learning_rate": 0.0035940064158013364, + "loss": 2.6645, + "mean_token_accuracy": 0.45198264718055725, + "num_tokens": 7688772946.0, + "step": 15041 + }, + { + "epoch": 4.067604110329908, + "grad_norm": 38.5, + "learning_rate": 0.0035931107629736236, + "loss": 3.0903, + "mean_token_accuracy": 0.4132441580295563, + "num_tokens": 7689297187.0, + "step": 15042 + }, + { + "epoch": 4.067874526771227, + "grad_norm": 4.75, + "learning_rate": 0.0035922153374110567, + "loss": 2.8416, + "mean_token_accuracy": 0.44582048058509827, + "num_tokens": 7689821355.0, + "step": 15043 + }, + { + "epoch": 4.068144943212547, + "grad_norm": 3.234375, + "learning_rate": 0.003591320139141108, + "loss": 2.9003, + "mean_token_accuracy": 0.4173007607460022, + "num_tokens": 7690345538.0, + "step": 15044 + }, + { + "epoch": 4.068415359653867, + "grad_norm": 3.921875, + "learning_rate": 0.0035904251681912434, + "loss": 3.0084, + "mean_token_accuracy": 0.4284999668598175, + "num_tokens": 7690869693.0, + "step": 15045 + }, + { + "epoch": 4.068685776095187, + "grad_norm": 3.96875, + "learning_rate": 0.003589530424588927, + "loss": 2.8321, + "mean_token_accuracy": 0.4445236623287201, + "num_tokens": 7691393943.0, + "step": 15046 + }, + { + "epoch": 4.068956192536506, + "grad_norm": 4.0, + "learning_rate": 0.003588635908361607, + "loss": 2.8459, + "mean_token_accuracy": 0.44072866439819336, + "num_tokens": 7691910143.0, + "step": 15047 + }, + { + "epoch": 4.069226608977826, + "grad_norm": 19.125, + "learning_rate": 0.0035877416195367363, + "loss": 2.8105, + "mean_token_accuracy": 0.44019240140914917, + "num_tokens": 7692434308.0, + "step": 15048 + }, + { + "epoch": 4.069497025419145, + "grad_norm": 5.65625, + "learning_rate": 0.003586847558141747, + "loss": 2.8244, + "mean_token_accuracy": 0.4514715075492859, + "num_tokens": 7692958565.0, + "step": 15049 + }, + { + "epoch": 4.069767441860465, + "grad_norm": 2.875, + "learning_rate": 0.0035859537242040776, + "loss": 2.9294, + "mean_token_accuracy": 0.43228471279144287, + "num_tokens": 7693482787.0, + "step": 15050 + }, + { + "epoch": 4.070037858301784, + "grad_norm": 20.0, + "learning_rate": 0.0035850601177511493, + "loss": 9.2773, + "mean_token_accuracy": 0.04115840047597885, + "num_tokens": 7694007042.0, + "step": 15051 + }, + { + "epoch": 4.0703082747431045, + "grad_norm": 7.46875, + "learning_rate": 0.0035841667388103854, + "loss": 3.0204, + "mean_token_accuracy": 0.427599161863327, + "num_tokens": 7694531126.0, + "step": 15052 + }, + { + "epoch": 4.070578691184424, + "grad_norm": 2.6875, + "learning_rate": 0.003583273587409194, + "loss": 2.846, + "mean_token_accuracy": 0.45195600390434265, + "num_tokens": 7695055304.0, + "step": 15053 + }, + { + "epoch": 4.070849107625744, + "grad_norm": 3.375, + "learning_rate": 0.003582380663574978, + "loss": 2.6826, + "mean_token_accuracy": 0.4442976117134094, + "num_tokens": 7695579535.0, + "step": 15054 + }, + { + "epoch": 4.071119524067063, + "grad_norm": 3.765625, + "learning_rate": 0.0035814879673351386, + "loss": 3.0182, + "mean_token_accuracy": 0.4246591329574585, + "num_tokens": 7696068347.0, + "step": 15055 + }, + { + "epoch": 4.071389940508383, + "grad_norm": 4.15625, + "learning_rate": 0.0035805954987170666, + "loss": 2.9329, + "mean_token_accuracy": 0.4168638586997986, + "num_tokens": 7696592481.0, + "step": 15056 + }, + { + "epoch": 4.071660356949702, + "grad_norm": 3.734375, + "learning_rate": 0.003579703257748139, + "loss": 2.8915, + "mean_token_accuracy": 0.4411334991455078, + "num_tokens": 7697086731.0, + "step": 15057 + }, + { + "epoch": 4.071930773391022, + "grad_norm": 3.515625, + "learning_rate": 0.0035788112444557407, + "loss": 2.7605, + "mean_token_accuracy": 0.44817325472831726, + "num_tokens": 7697610909.0, + "step": 15058 + }, + { + "epoch": 4.072201189832342, + "grad_norm": 3.375, + "learning_rate": 0.0035779194588672347, + "loss": 2.8215, + "mean_token_accuracy": 0.42995592951774597, + "num_tokens": 7698135088.0, + "step": 15059 + }, + { + "epoch": 4.072471606273662, + "grad_norm": 2.984375, + "learning_rate": 0.0035770279010099906, + "loss": 2.8038, + "mean_token_accuracy": 0.4363866448402405, + "num_tokens": 7698659280.0, + "step": 15060 + }, + { + "epoch": 4.072742022714981, + "grad_norm": 2.984375, + "learning_rate": 0.003576136570911358, + "loss": 2.7325, + "mean_token_accuracy": 0.45499154925346375, + "num_tokens": 7699123535.0, + "step": 15061 + }, + { + "epoch": 4.073012439156301, + "grad_norm": 4.1875, + "learning_rate": 0.0035752454685986867, + "loss": 3.0382, + "mean_token_accuracy": 0.4214954376220703, + "num_tokens": 7699647785.0, + "step": 15062 + }, + { + "epoch": 4.07328285559762, + "grad_norm": 5.1875, + "learning_rate": 0.0035743545940993193, + "loss": 2.9541, + "mean_token_accuracy": 0.4128360152244568, + "num_tokens": 7700171976.0, + "step": 15063 + }, + { + "epoch": 4.07355327203894, + "grad_norm": 3.359375, + "learning_rate": 0.0035734639474405915, + "loss": 2.7511, + "mean_token_accuracy": 0.4491513967514038, + "num_tokens": 7700696062.0, + "step": 15064 + }, + { + "epoch": 4.073823688480259, + "grad_norm": 3.515625, + "learning_rate": 0.0035725735286498282, + "loss": 2.8245, + "mean_token_accuracy": 0.45119142532348633, + "num_tokens": 7701161185.0, + "step": 15065 + }, + { + "epoch": 4.0740941049215795, + "grad_norm": 3.265625, + "learning_rate": 0.0035716833377543535, + "loss": 2.7612, + "mean_token_accuracy": 0.42499881982803345, + "num_tokens": 7701685441.0, + "step": 15066 + }, + { + "epoch": 4.074364521362899, + "grad_norm": 3.09375, + "learning_rate": 0.00357079337478148, + "loss": 2.8282, + "mean_token_accuracy": 0.4408389925956726, + "num_tokens": 7702153287.0, + "step": 15067 + }, + { + "epoch": 4.074634937804219, + "grad_norm": 3.90625, + "learning_rate": 0.0035699036397585106, + "loss": 2.9138, + "mean_token_accuracy": 0.4297974407672882, + "num_tokens": 7702677450.0, + "step": 15068 + }, + { + "epoch": 4.074905354245538, + "grad_norm": 3.375, + "learning_rate": 0.003569014132712751, + "loss": 2.7422, + "mean_token_accuracy": 0.45194223523139954, + "num_tokens": 7703160074.0, + "step": 15069 + }, + { + "epoch": 4.075175770686858, + "grad_norm": 3.9375, + "learning_rate": 0.0035681248536714918, + "loss": 2.8979, + "mean_token_accuracy": 0.4477369487285614, + "num_tokens": 7703684143.0, + "step": 15070 + }, + { + "epoch": 4.075446187128177, + "grad_norm": 11.0625, + "learning_rate": 0.0035672358026620134, + "loss": 9.3471, + "mean_token_accuracy": 0.013768711127340794, + "num_tokens": 7704186496.0, + "step": 15071 + }, + { + "epoch": 4.075716603569497, + "grad_norm": 7.75, + "learning_rate": 0.003566346979711602, + "loss": 2.8954, + "mean_token_accuracy": 0.4327499270439148, + "num_tokens": 7704710702.0, + "step": 15072 + }, + { + "epoch": 4.075987020010817, + "grad_norm": 2.5, + "learning_rate": 0.0035654583848475263, + "loss": 2.8221, + "mean_token_accuracy": 0.44293707609176636, + "num_tokens": 7705234925.0, + "step": 15073 + }, + { + "epoch": 4.076257436452137, + "grad_norm": 4.09375, + "learning_rate": 0.003564570018097049, + "loss": 2.8681, + "mean_token_accuracy": 0.43571922183036804, + "num_tokens": 7705758998.0, + "step": 15074 + }, + { + "epoch": 4.076527852893456, + "grad_norm": 2.671875, + "learning_rate": 0.0035636818794874316, + "loss": 2.9213, + "mean_token_accuracy": 0.4769887328147888, + "num_tokens": 7706195696.0, + "step": 15075 + }, + { + "epoch": 4.076798269334776, + "grad_norm": 3.015625, + "learning_rate": 0.0035627939690459206, + "loss": 2.876, + "mean_token_accuracy": 0.4386545419692993, + "num_tokens": 7706719868.0, + "step": 15076 + }, + { + "epoch": 4.077068685776095, + "grad_norm": 3.046875, + "learning_rate": 0.0035619062867997624, + "loss": 2.7502, + "mean_token_accuracy": 0.43783873319625854, + "num_tokens": 7707224073.0, + "step": 15077 + }, + { + "epoch": 4.077339102217415, + "grad_norm": 2.984375, + "learning_rate": 0.003561018832776195, + "loss": 2.8789, + "mean_token_accuracy": 0.4453110098838806, + "num_tokens": 7707725257.0, + "step": 15078 + }, + { + "epoch": 4.0776095186587344, + "grad_norm": 3.859375, + "learning_rate": 0.003560131607002447, + "loss": 2.8029, + "mean_token_accuracy": 0.42559340596199036, + "num_tokens": 7708249447.0, + "step": 15079 + }, + { + "epoch": 4.077879935100054, + "grad_norm": 2.59375, + "learning_rate": 0.0035592446095057385, + "loss": 2.6384, + "mean_token_accuracy": 0.4449857473373413, + "num_tokens": 7708773688.0, + "step": 15080 + }, + { + "epoch": 4.078150351541374, + "grad_norm": 3.5625, + "learning_rate": 0.003558357840313289, + "loss": 2.9524, + "mean_token_accuracy": 0.418557345867157, + "num_tokens": 7709297877.0, + "step": 15081 + }, + { + "epoch": 4.078420767982693, + "grad_norm": 3.359375, + "learning_rate": 0.0035574712994523028, + "loss": 2.8996, + "mean_token_accuracy": 0.43509864807128906, + "num_tokens": 7709822115.0, + "step": 15082 + }, + { + "epoch": 4.078691184424013, + "grad_norm": 3.4375, + "learning_rate": 0.003556584986949987, + "loss": 2.8047, + "mean_token_accuracy": 0.43128716945648193, + "num_tokens": 7710346129.0, + "step": 15083 + }, + { + "epoch": 4.078961600865332, + "grad_norm": 3.25, + "learning_rate": 0.0035556989028335336, + "loss": 2.8341, + "mean_token_accuracy": 0.41990089416503906, + "num_tokens": 7710870315.0, + "step": 15084 + }, + { + "epoch": 4.079232017306652, + "grad_norm": 3.3125, + "learning_rate": 0.0035548130471301266, + "loss": 2.9319, + "mean_token_accuracy": 0.4247768521308899, + "num_tokens": 7711394496.0, + "step": 15085 + }, + { + "epoch": 4.0795024337479715, + "grad_norm": 3.421875, + "learning_rate": 0.0035539274198669534, + "loss": 2.695, + "mean_token_accuracy": 0.4306863844394684, + "num_tokens": 7711918674.0, + "step": 15086 + }, + { + "epoch": 4.079772850189292, + "grad_norm": 3.0625, + "learning_rate": 0.003553042021071185, + "loss": 2.7162, + "mean_token_accuracy": 0.425807386636734, + "num_tokens": 7712442929.0, + "step": 15087 + }, + { + "epoch": 4.080043266630611, + "grad_norm": 3.09375, + "learning_rate": 0.0035521568507699843, + "loss": 3.1141, + "mean_token_accuracy": 0.40555596351623535, + "num_tokens": 7712944065.0, + "step": 15088 + }, + { + "epoch": 4.080313683071931, + "grad_norm": 3.484375, + "learning_rate": 0.0035512719089905165, + "loss": 2.7937, + "mean_token_accuracy": 0.4413430690765381, + "num_tokens": 7713468098.0, + "step": 15089 + }, + { + "epoch": 4.08058409951325, + "grad_norm": 3.53125, + "learning_rate": 0.00355038719575993, + "loss": 2.8489, + "mean_token_accuracy": 0.440671443939209, + "num_tokens": 7713972896.0, + "step": 15090 + }, + { + "epoch": 4.08085451595457, + "grad_norm": 14.375, + "learning_rate": 0.0035495027111053746, + "loss": 9.2677, + "mean_token_accuracy": 0.013869913294911385, + "num_tokens": 7714497111.0, + "step": 15091 + }, + { + "epoch": 4.081124932395889, + "grad_norm": 6.84375, + "learning_rate": 0.0035486184550539836, + "loss": 3.0202, + "mean_token_accuracy": 0.4513451159000397, + "num_tokens": 7714918589.0, + "step": 15092 + }, + { + "epoch": 4.0813953488372094, + "grad_norm": 2.734375, + "learning_rate": 0.003547734427632894, + "loss": 2.6625, + "mean_token_accuracy": 0.46806877851486206, + "num_tokens": 7715336932.0, + "step": 15093 + }, + { + "epoch": 4.081665765278529, + "grad_norm": 3.78125, + "learning_rate": 0.003546850628869226, + "loss": 3.014, + "mean_token_accuracy": 0.4291249215602875, + "num_tokens": 7715861182.0, + "step": 15094 + }, + { + "epoch": 4.081936181719849, + "grad_norm": 3.75, + "learning_rate": 0.0035459670587901005, + "loss": 2.7969, + "mean_token_accuracy": 0.4323081970214844, + "num_tokens": 7716348241.0, + "step": 15095 + }, + { + "epoch": 4.082206598161168, + "grad_norm": 3.9375, + "learning_rate": 0.0035450837174226257, + "loss": 2.8208, + "mean_token_accuracy": 0.4201643466949463, + "num_tokens": 7716872400.0, + "step": 15096 + }, + { + "epoch": 4.082477014602488, + "grad_norm": 3.640625, + "learning_rate": 0.003544200604793906, + "loss": 3.0249, + "mean_token_accuracy": 0.40146592259407043, + "num_tokens": 7717379916.0, + "step": 15097 + }, + { + "epoch": 4.082747431043807, + "grad_norm": 3.390625, + "learning_rate": 0.0035433177209310386, + "loss": 2.8212, + "mean_token_accuracy": 0.4226463735103607, + "num_tokens": 7717904103.0, + "step": 15098 + }, + { + "epoch": 4.083017847485127, + "grad_norm": 2.953125, + "learning_rate": 0.0035424350658611126, + "loss": 2.9305, + "mean_token_accuracy": 0.44642317295074463, + "num_tokens": 7718333551.0, + "step": 15099 + }, + { + "epoch": 4.0832882639264465, + "grad_norm": 2.71875, + "learning_rate": 0.003541552639611208, + "loss": 2.8658, + "mean_token_accuracy": 0.4059605002403259, + "num_tokens": 7718857749.0, + "step": 15100 + }, + { + "epoch": 4.083558680367767, + "grad_norm": 3.265625, + "learning_rate": 0.0035406704422084037, + "loss": 2.7686, + "mean_token_accuracy": 0.4598633348941803, + "num_tokens": 7719343880.0, + "step": 15101 + }, + { + "epoch": 4.083829096809086, + "grad_norm": 3.265625, + "learning_rate": 0.0035397884736797676, + "loss": 2.7753, + "mean_token_accuracy": 0.4370823800563812, + "num_tokens": 7719868006.0, + "step": 15102 + }, + { + "epoch": 4.084099513250406, + "grad_norm": 3.5, + "learning_rate": 0.003538906734052357, + "loss": 2.7053, + "mean_token_accuracy": 0.4382328391075134, + "num_tokens": 7720392240.0, + "step": 15103 + }, + { + "epoch": 4.084369929691725, + "grad_norm": 3.0625, + "learning_rate": 0.00353802522335323, + "loss": 2.9124, + "mean_token_accuracy": 0.4349759817123413, + "num_tokens": 7720916438.0, + "step": 15104 + }, + { + "epoch": 4.084640346133045, + "grad_norm": 3.84375, + "learning_rate": 0.0035371439416094362, + "loss": 2.9809, + "mean_token_accuracy": 0.423342764377594, + "num_tokens": 7721395904.0, + "step": 15105 + }, + { + "epoch": 4.084910762574364, + "grad_norm": 3.4375, + "learning_rate": 0.003536262888848011, + "loss": 2.8178, + "mean_token_accuracy": 0.4233470559120178, + "num_tokens": 7721920185.0, + "step": 15106 + }, + { + "epoch": 4.0851811790156844, + "grad_norm": 3.421875, + "learning_rate": 0.00353538206509599, + "loss": 2.778, + "mean_token_accuracy": 0.44967424869537354, + "num_tokens": 7722444460.0, + "step": 15107 + }, + { + "epoch": 4.085451595457004, + "grad_norm": 3.015625, + "learning_rate": 0.0035345014703803985, + "loss": 2.9247, + "mean_token_accuracy": 0.41718223690986633, + "num_tokens": 7722968666.0, + "step": 15108 + }, + { + "epoch": 4.085722011898324, + "grad_norm": 3.34375, + "learning_rate": 0.0035336211047282597, + "loss": 3.0003, + "mean_token_accuracy": 0.4255342483520508, + "num_tokens": 7723479469.0, + "step": 15109 + }, + { + "epoch": 4.085992428339643, + "grad_norm": 3.5625, + "learning_rate": 0.0035327409681665803, + "loss": 2.9225, + "mean_token_accuracy": 0.43653395771980286, + "num_tokens": 7724003749.0, + "step": 15110 + }, + { + "epoch": 4.086262844780963, + "grad_norm": 7.6875, + "learning_rate": 0.0035318610607223667, + "loss": 9.2445, + "mean_token_accuracy": 0.0017712671542540193, + "num_tokens": 7724527981.0, + "step": 15111 + }, + { + "epoch": 4.086533261222282, + "grad_norm": 7.78125, + "learning_rate": 0.00353098138242262, + "loss": 2.8175, + "mean_token_accuracy": 0.4504333734512329, + "num_tokens": 7724972296.0, + "step": 15112 + }, + { + "epoch": 4.086803677663602, + "grad_norm": 2.765625, + "learning_rate": 0.0035301019332943306, + "loss": 2.7204, + "mean_token_accuracy": 0.454947292804718, + "num_tokens": 7725459460.0, + "step": 15113 + }, + { + "epoch": 4.0870740941049215, + "grad_norm": 4.0625, + "learning_rate": 0.0035292227133644784, + "loss": 2.7823, + "mean_token_accuracy": 0.44028928875923157, + "num_tokens": 7725983684.0, + "step": 15114 + }, + { + "epoch": 4.087344510546242, + "grad_norm": 3.328125, + "learning_rate": 0.0035283437226600457, + "loss": 2.7481, + "mean_token_accuracy": 0.43930694460868835, + "num_tokens": 7726507793.0, + "step": 15115 + }, + { + "epoch": 4.087614926987561, + "grad_norm": 3.921875, + "learning_rate": 0.003527464961208, + "loss": 2.8019, + "mean_token_accuracy": 0.4484129846096039, + "num_tokens": 7727031975.0, + "step": 15116 + }, + { + "epoch": 4.087885343428881, + "grad_norm": 3.421875, + "learning_rate": 0.003526586429035302, + "loss": 2.5618, + "mean_token_accuracy": 0.4590883255004883, + "num_tokens": 7727492762.0, + "step": 15117 + }, + { + "epoch": 4.0881557598702, + "grad_norm": 2.40625, + "learning_rate": 0.003525708126168914, + "loss": 2.8923, + "mean_token_accuracy": 0.43585720658302307, + "num_tokens": 7728017013.0, + "step": 15118 + }, + { + "epoch": 4.08842617631152, + "grad_norm": 3.453125, + "learning_rate": 0.0035248300526357773, + "loss": 2.7365, + "mean_token_accuracy": 0.4248206913471222, + "num_tokens": 7728541219.0, + "step": 15119 + }, + { + "epoch": 4.088696592752839, + "grad_norm": 2.9375, + "learning_rate": 0.0035239522084628385, + "loss": 2.9462, + "mean_token_accuracy": 0.4209699034690857, + "num_tokens": 7729065473.0, + "step": 15120 + }, + { + "epoch": 4.088967009194159, + "grad_norm": 3.78125, + "learning_rate": 0.0035230745936770335, + "loss": 2.7983, + "mean_token_accuracy": 0.43090248107910156, + "num_tokens": 7729589558.0, + "step": 15121 + }, + { + "epoch": 4.089237425635479, + "grad_norm": 3.59375, + "learning_rate": 0.0035221972083052887, + "loss": 2.7961, + "mean_token_accuracy": 0.4194279909133911, + "num_tokens": 7730113838.0, + "step": 15122 + }, + { + "epoch": 4.089507842076798, + "grad_norm": 3.34375, + "learning_rate": 0.0035213200523745228, + "loss": 2.723, + "mean_token_accuracy": 0.4491523206233978, + "num_tokens": 7730631745.0, + "step": 15123 + }, + { + "epoch": 4.089778258518118, + "grad_norm": 3.984375, + "learning_rate": 0.0035204431259116516, + "loss": 2.8283, + "mean_token_accuracy": 0.4557797908782959, + "num_tokens": 7731099761.0, + "step": 15124 + }, + { + "epoch": 4.090048674959437, + "grad_norm": 3.046875, + "learning_rate": 0.003519566428943581, + "loss": 2.7606, + "mean_token_accuracy": 0.4605731666088104, + "num_tokens": 7731513572.0, + "step": 15125 + }, + { + "epoch": 4.090319091400757, + "grad_norm": 3.21875, + "learning_rate": 0.00351868996149721, + "loss": 2.5908, + "mean_token_accuracy": 0.46697092056274414, + "num_tokens": 7732037731.0, + "step": 15126 + }, + { + "epoch": 4.090589507842076, + "grad_norm": 3.140625, + "learning_rate": 0.0035178137235994346, + "loss": 2.8683, + "mean_token_accuracy": 0.43338263034820557, + "num_tokens": 7732561916.0, + "step": 15127 + }, + { + "epoch": 4.0908599242833965, + "grad_norm": 3.640625, + "learning_rate": 0.0035169377152771365, + "loss": 2.8565, + "mean_token_accuracy": 0.43761327862739563, + "num_tokens": 7733086057.0, + "step": 15128 + }, + { + "epoch": 4.091130340724716, + "grad_norm": 3.59375, + "learning_rate": 0.003516061936557194, + "loss": 2.7872, + "mean_token_accuracy": 0.4283403754234314, + "num_tokens": 7733610232.0, + "step": 15129 + }, + { + "epoch": 4.091400757166036, + "grad_norm": 3.53125, + "learning_rate": 0.0035151863874664825, + "loss": 2.9391, + "mean_token_accuracy": 0.42951375246047974, + "num_tokens": 7734118538.0, + "step": 15130 + }, + { + "epoch": 4.091671173607355, + "grad_norm": 7.8125, + "learning_rate": 0.0035143110680318605, + "loss": 8.5946, + "mean_token_accuracy": 0.037736088037490845, + "num_tokens": 7734642607.0, + "step": 15131 + }, + { + "epoch": 4.091941590048675, + "grad_norm": 9.8125, + "learning_rate": 0.0035134359782801926, + "loss": 3.0749, + "mean_token_accuracy": 0.3982442021369934, + "num_tokens": 7735166881.0, + "step": 15132 + }, + { + "epoch": 4.092212006489994, + "grad_norm": 2.953125, + "learning_rate": 0.003512561118238323, + "loss": 2.7659, + "mean_token_accuracy": 0.4432724118232727, + "num_tokens": 7735651546.0, + "step": 15133 + }, + { + "epoch": 4.092482422931314, + "grad_norm": 4.0625, + "learning_rate": 0.003511686487933097, + "loss": 2.8416, + "mean_token_accuracy": 0.454897940158844, + "num_tokens": 7736171921.0, + "step": 15134 + }, + { + "epoch": 4.092752839372634, + "grad_norm": 3.75, + "learning_rate": 0.0035108120873913536, + "loss": 2.7752, + "mean_token_accuracy": 0.4515409767627716, + "num_tokens": 7736696148.0, + "step": 15135 + }, + { + "epoch": 4.093023255813954, + "grad_norm": 3.609375, + "learning_rate": 0.003509937916639919, + "loss": 2.8787, + "mean_token_accuracy": 0.46539342403411865, + "num_tokens": 7737201828.0, + "step": 15136 + }, + { + "epoch": 4.093293672255273, + "grad_norm": 3.0, + "learning_rate": 0.0035090639757056155, + "loss": 2.8773, + "mean_token_accuracy": 0.4344199597835541, + "num_tokens": 7737685902.0, + "step": 15137 + }, + { + "epoch": 4.093564088696593, + "grad_norm": 3.4375, + "learning_rate": 0.00350819026461526, + "loss": 2.8577, + "mean_token_accuracy": 0.431502103805542, + "num_tokens": 7738210162.0, + "step": 15138 + }, + { + "epoch": 4.093834505137912, + "grad_norm": 3.421875, + "learning_rate": 0.0035073167833956586, + "loss": 2.8467, + "mean_token_accuracy": 0.4569706916809082, + "num_tokens": 7738659607.0, + "step": 15139 + }, + { + "epoch": 4.094104921579232, + "grad_norm": 3.109375, + "learning_rate": 0.003506443532073611, + "loss": 2.8045, + "mean_token_accuracy": 0.4590231478214264, + "num_tokens": 7739157271.0, + "step": 15140 + }, + { + "epoch": 4.094375338020551, + "grad_norm": 3.453125, + "learning_rate": 0.003505570510675915, + "loss": 2.7721, + "mean_token_accuracy": 0.4456931948661804, + "num_tokens": 7739642524.0, + "step": 15141 + }, + { + "epoch": 4.0946457544618715, + "grad_norm": 3.203125, + "learning_rate": 0.003504697719229355, + "loss": 2.8911, + "mean_token_accuracy": 0.4345895051956177, + "num_tokens": 7740166791.0, + "step": 15142 + }, + { + "epoch": 4.094916170903191, + "grad_norm": 3.015625, + "learning_rate": 0.0035038251577607107, + "loss": 2.6938, + "mean_token_accuracy": 0.4426119923591614, + "num_tokens": 7740691004.0, + "step": 15143 + }, + { + "epoch": 4.095186587344511, + "grad_norm": 3.046875, + "learning_rate": 0.003502952826296756, + "loss": 2.9125, + "mean_token_accuracy": 0.4486931264400482, + "num_tokens": 7741213025.0, + "step": 15144 + }, + { + "epoch": 4.09545700378583, + "grad_norm": 3.984375, + "learning_rate": 0.003502080724864256, + "loss": 3.0674, + "mean_token_accuracy": 0.4170961380004883, + "num_tokens": 7741737133.0, + "step": 15145 + }, + { + "epoch": 4.09572742022715, + "grad_norm": 3.875, + "learning_rate": 0.003501208853489968, + "loss": 2.914, + "mean_token_accuracy": 0.41050511598587036, + "num_tokens": 7742261306.0, + "step": 15146 + }, + { + "epoch": 4.095997836668469, + "grad_norm": 4.03125, + "learning_rate": 0.003500337212200644, + "loss": 2.6562, + "mean_token_accuracy": 0.4386689066886902, + "num_tokens": 7742785549.0, + "step": 15147 + }, + { + "epoch": 4.096268253109789, + "grad_norm": 3.109375, + "learning_rate": 0.0034994658010230324, + "loss": 2.9327, + "mean_token_accuracy": 0.4301832318305969, + "num_tokens": 7743309789.0, + "step": 15148 + }, + { + "epoch": 4.096538669551109, + "grad_norm": 3.984375, + "learning_rate": 0.003498594619983864, + "loss": 3.0425, + "mean_token_accuracy": 0.4038936495780945, + "num_tokens": 7743834021.0, + "step": 15149 + }, + { + "epoch": 4.096809085992429, + "grad_norm": 3.671875, + "learning_rate": 0.0034977236691098768, + "loss": 2.6827, + "mean_token_accuracy": 0.43351835012435913, + "num_tokens": 7744358237.0, + "step": 15150 + }, + { + "epoch": 4.097079502433748, + "grad_norm": 26.875, + "learning_rate": 0.003496852948427789, + "loss": 8.9143, + "mean_token_accuracy": 0.012734293937683105, + "num_tokens": 7744797002.0, + "step": 15151 + }, + { + "epoch": 4.097349918875068, + "grad_norm": 6.34375, + "learning_rate": 0.003495982457964316, + "loss": 2.9274, + "mean_token_accuracy": 0.43038463592529297, + "num_tokens": 7745276272.0, + "step": 15152 + }, + { + "epoch": 4.097620335316387, + "grad_norm": 3.0, + "learning_rate": 0.0034951121977461717, + "loss": 2.8958, + "mean_token_accuracy": 0.4367779791355133, + "num_tokens": 7745792663.0, + "step": 15153 + }, + { + "epoch": 4.097890751757707, + "grad_norm": 3.59375, + "learning_rate": 0.0034942421678000536, + "loss": 2.8153, + "mean_token_accuracy": 0.4364272356033325, + "num_tokens": 7746262738.0, + "step": 15154 + }, + { + "epoch": 4.098161168199026, + "grad_norm": 3.9375, + "learning_rate": 0.0034933723681526614, + "loss": 2.8077, + "mean_token_accuracy": 0.42469871044158936, + "num_tokens": 7746787008.0, + "step": 15155 + }, + { + "epoch": 4.0984315846403465, + "grad_norm": 3.78125, + "learning_rate": 0.003492502798830679, + "loss": 2.9641, + "mean_token_accuracy": 0.4399643540382385, + "num_tokens": 7747268692.0, + "step": 15156 + }, + { + "epoch": 4.098702001081666, + "grad_norm": 3.828125, + "learning_rate": 0.003491633459860788, + "loss": 2.786, + "mean_token_accuracy": 0.4407927095890045, + "num_tokens": 7747792872.0, + "step": 15157 + }, + { + "epoch": 4.098972417522986, + "grad_norm": 4.5, + "learning_rate": 0.003490764351269665, + "loss": 2.7431, + "mean_token_accuracy": 0.437923789024353, + "num_tokens": 7748317138.0, + "step": 15158 + }, + { + "epoch": 4.099242833964305, + "grad_norm": 3.96875, + "learning_rate": 0.003489895473083975, + "loss": 2.9302, + "mean_token_accuracy": 0.41166597604751587, + "num_tokens": 7748841210.0, + "step": 15159 + }, + { + "epoch": 4.099513250405625, + "grad_norm": 3.234375, + "learning_rate": 0.003489026825330375, + "loss": 2.776, + "mean_token_accuracy": 0.4487317204475403, + "num_tokens": 7749365375.0, + "step": 15160 + }, + { + "epoch": 4.099783666846944, + "grad_norm": 4.0, + "learning_rate": 0.0034881584080355224, + "loss": 2.8651, + "mean_token_accuracy": 0.43392324447631836, + "num_tokens": 7749884075.0, + "step": 15161 + }, + { + "epoch": 4.1000540832882635, + "grad_norm": 3.71875, + "learning_rate": 0.0034872902212260593, + "loss": 2.8073, + "mean_token_accuracy": 0.44819048047065735, + "num_tokens": 7750400866.0, + "step": 15162 + }, + { + "epoch": 4.100324499729584, + "grad_norm": 3.84375, + "learning_rate": 0.003486422264928625, + "loss": 2.9636, + "mean_token_accuracy": 0.4324880838394165, + "num_tokens": 7750925089.0, + "step": 15163 + }, + { + "epoch": 4.100594916170903, + "grad_norm": 3.796875, + "learning_rate": 0.003485554539169853, + "loss": 2.8568, + "mean_token_accuracy": 0.44570988416671753, + "num_tokens": 7751449294.0, + "step": 15164 + }, + { + "epoch": 4.100865332612223, + "grad_norm": 3.890625, + "learning_rate": 0.003484687043976368, + "loss": 2.6942, + "mean_token_accuracy": 0.43123725056648254, + "num_tokens": 7751973365.0, + "step": 15165 + }, + { + "epoch": 4.101135749053542, + "grad_norm": 2.640625, + "learning_rate": 0.0034838197793747826, + "loss": 2.7544, + "mean_token_accuracy": 0.45046165585517883, + "num_tokens": 7752497399.0, + "step": 15166 + }, + { + "epoch": 4.101406165494862, + "grad_norm": 4.15625, + "learning_rate": 0.003482952745391712, + "loss": 2.8796, + "mean_token_accuracy": 0.4142320454120636, + "num_tokens": 7753021558.0, + "step": 15167 + }, + { + "epoch": 4.101676581936181, + "grad_norm": 3.15625, + "learning_rate": 0.0034820859420537577, + "loss": 2.8644, + "mean_token_accuracy": 0.4432803988456726, + "num_tokens": 7753491192.0, + "step": 15168 + }, + { + "epoch": 4.101946998377501, + "grad_norm": 3.90625, + "learning_rate": 0.003481219369387514, + "loss": 2.5843, + "mean_token_accuracy": 0.45194369554519653, + "num_tokens": 7754015464.0, + "step": 15169 + }, + { + "epoch": 4.102217414818821, + "grad_norm": 3.0625, + "learning_rate": 0.003480353027419572, + "loss": 2.7827, + "mean_token_accuracy": 0.4585033059120178, + "num_tokens": 7754539624.0, + "step": 15170 + }, + { + "epoch": 4.102487831260141, + "grad_norm": 6.625, + "learning_rate": 0.003479486916176514, + "loss": 8.0546, + "mean_token_accuracy": 0.035060979425907135, + "num_tokens": 7755063795.0, + "step": 15171 + }, + { + "epoch": 4.10275824770146, + "grad_norm": 7.6875, + "learning_rate": 0.00347862103568491, + "loss": 2.9796, + "mean_token_accuracy": 0.3975495994091034, + "num_tokens": 7755569185.0, + "step": 15172 + }, + { + "epoch": 4.10302866414278, + "grad_norm": 2.796875, + "learning_rate": 0.0034777553859713364, + "loss": 2.8442, + "mean_token_accuracy": 0.4261873960494995, + "num_tokens": 7756093405.0, + "step": 15173 + }, + { + "epoch": 4.103299080584099, + "grad_norm": 3.140625, + "learning_rate": 0.003476889967062344, + "loss": 2.8184, + "mean_token_accuracy": 0.4296911060810089, + "num_tokens": 7756617579.0, + "step": 15174 + }, + { + "epoch": 4.103569497025419, + "grad_norm": 3.453125, + "learning_rate": 0.0034760247789844957, + "loss": 2.6508, + "mean_token_accuracy": 0.44849950075149536, + "num_tokens": 7757083275.0, + "step": 15175 + }, + { + "epoch": 4.1038399134667385, + "grad_norm": 2.984375, + "learning_rate": 0.003475159821764331, + "loss": 2.9657, + "mean_token_accuracy": 0.431143581867218, + "num_tokens": 7757607404.0, + "step": 15176 + }, + { + "epoch": 4.104110329908059, + "grad_norm": 3.53125, + "learning_rate": 0.0034742950954283927, + "loss": 2.7633, + "mean_token_accuracy": 0.43079131841659546, + "num_tokens": 7758131674.0, + "step": 15177 + }, + { + "epoch": 4.104380746349378, + "grad_norm": 4.0625, + "learning_rate": 0.0034734306000032134, + "loss": 2.6072, + "mean_token_accuracy": 0.47774380445480347, + "num_tokens": 7758655834.0, + "step": 15178 + }, + { + "epoch": 4.104651162790698, + "grad_norm": 2.859375, + "learning_rate": 0.003472566335515318, + "loss": 2.6896, + "mean_token_accuracy": 0.451740026473999, + "num_tokens": 7759180072.0, + "step": 15179 + }, + { + "epoch": 4.104921579232017, + "grad_norm": 2.984375, + "learning_rate": 0.003471702301991222, + "loss": 2.8686, + "mean_token_accuracy": 0.4351862967014313, + "num_tokens": 7759704264.0, + "step": 15180 + }, + { + "epoch": 4.105191995673337, + "grad_norm": 3.484375, + "learning_rate": 0.0034708384994574415, + "loss": 2.9115, + "mean_token_accuracy": 0.43381768465042114, + "num_tokens": 7760228456.0, + "step": 15181 + }, + { + "epoch": 4.105462412114656, + "grad_norm": 3.4375, + "learning_rate": 0.003469974927940477, + "loss": 2.8603, + "mean_token_accuracy": 0.44595447182655334, + "num_tokens": 7760750831.0, + "step": 15182 + }, + { + "epoch": 4.105732828555976, + "grad_norm": 5.0625, + "learning_rate": 0.0034691115874668243, + "loss": 2.7441, + "mean_token_accuracy": 0.45604458451271057, + "num_tokens": 7761215087.0, + "step": 15183 + }, + { + "epoch": 4.106003244997296, + "grad_norm": 2.6875, + "learning_rate": 0.0034682484780629765, + "loss": 2.7865, + "mean_token_accuracy": 0.45587483048439026, + "num_tokens": 7761719976.0, + "step": 15184 + }, + { + "epoch": 4.106273661438616, + "grad_norm": 3.84375, + "learning_rate": 0.0034673855997554154, + "loss": 2.7913, + "mean_token_accuracy": 0.4291993975639343, + "num_tokens": 7762244173.0, + "step": 15185 + }, + { + "epoch": 4.106544077879935, + "grad_norm": 2.640625, + "learning_rate": 0.003466522952570613, + "loss": 2.8664, + "mean_token_accuracy": 0.4521472752094269, + "num_tokens": 7762705110.0, + "step": 15186 + }, + { + "epoch": 4.106814494321255, + "grad_norm": 3.65625, + "learning_rate": 0.003465660536535044, + "loss": 2.7361, + "mean_token_accuracy": 0.4598396420478821, + "num_tokens": 7763229361.0, + "step": 15187 + }, + { + "epoch": 4.107084910762574, + "grad_norm": 3.21875, + "learning_rate": 0.0034647983516751676, + "loss": 2.9464, + "mean_token_accuracy": 0.43516629934310913, + "num_tokens": 7763705077.0, + "step": 15188 + }, + { + "epoch": 4.107355327203894, + "grad_norm": 3.3125, + "learning_rate": 0.003463936398017433, + "loss": 2.9037, + "mean_token_accuracy": 0.43485474586486816, + "num_tokens": 7764229258.0, + "step": 15189 + }, + { + "epoch": 4.1076257436452135, + "grad_norm": 3.546875, + "learning_rate": 0.0034630746755882937, + "loss": 2.7977, + "mean_token_accuracy": 0.4637152850627899, + "num_tokens": 7764688028.0, + "step": 15190 + }, + { + "epoch": 4.107896160086534, + "grad_norm": 54.5, + "learning_rate": 0.0034622131844141892, + "loss": 6.2527, + "mean_token_accuracy": 0.09010006487369537, + "num_tokens": 7765187799.0, + "step": 15191 + }, + { + "epoch": 4.108166576527853, + "grad_norm": 8.0625, + "learning_rate": 0.0034613519245215497, + "loss": 2.6173, + "mean_token_accuracy": 0.4567584991455078, + "num_tokens": 7765711935.0, + "step": 15192 + }, + { + "epoch": 4.108436992969173, + "grad_norm": 3.375, + "learning_rate": 0.003460490895936804, + "loss": 2.9617, + "mean_token_accuracy": 0.44209960103034973, + "num_tokens": 7766172896.0, + "step": 15193 + }, + { + "epoch": 4.108707409410492, + "grad_norm": 4.9375, + "learning_rate": 0.0034596300986863716, + "loss": 2.74, + "mean_token_accuracy": 0.43088269233703613, + "num_tokens": 7766697113.0, + "step": 15194 + }, + { + "epoch": 4.108977825851812, + "grad_norm": 3.25, + "learning_rate": 0.0034587695327966596, + "loss": 2.9367, + "mean_token_accuracy": 0.4331110417842865, + "num_tokens": 7767193555.0, + "step": 15195 + }, + { + "epoch": 4.109248242293131, + "grad_norm": 5.25, + "learning_rate": 0.003457909198294077, + "loss": 2.7297, + "mean_token_accuracy": 0.4672936201095581, + "num_tokens": 7767717747.0, + "step": 15196 + }, + { + "epoch": 4.109518658734451, + "grad_norm": 3.40625, + "learning_rate": 0.0034570490952050216, + "loss": 2.6131, + "mean_token_accuracy": 0.4532231092453003, + "num_tokens": 7768242003.0, + "step": 15197 + }, + { + "epoch": 4.109789075175771, + "grad_norm": 4.84375, + "learning_rate": 0.003456189223555879, + "loss": 2.6938, + "mean_token_accuracy": 0.44120895862579346, + "num_tokens": 7768766088.0, + "step": 15198 + }, + { + "epoch": 4.110059491617091, + "grad_norm": 3.46875, + "learning_rate": 0.003455329583373038, + "loss": 2.8187, + "mean_token_accuracy": 0.4378341734409332, + "num_tokens": 7769290323.0, + "step": 15199 + }, + { + "epoch": 4.11032990805841, + "grad_norm": 4.28125, + "learning_rate": 0.0034544701746828717, + "loss": 2.9214, + "mean_token_accuracy": 0.43336987495422363, + "num_tokens": 7769814281.0, + "step": 15200 + }, + { + "epoch": 4.11060032449973, + "grad_norm": 3.5, + "learning_rate": 0.0034536109975117518, + "loss": 2.6215, + "mean_token_accuracy": 0.457428514957428, + "num_tokens": 7770338389.0, + "step": 15201 + }, + { + "epoch": 4.110870740941049, + "grad_norm": 3.640625, + "learning_rate": 0.0034527520518860388, + "loss": 2.8066, + "mean_token_accuracy": 0.44017988443374634, + "num_tokens": 7770862276.0, + "step": 15202 + }, + { + "epoch": 4.111141157382368, + "grad_norm": 2.96875, + "learning_rate": 0.0034518933378320862, + "loss": 2.7055, + "mean_token_accuracy": 0.43877363204956055, + "num_tokens": 7771386399.0, + "step": 15203 + }, + { + "epoch": 4.1114115738236885, + "grad_norm": 3.5, + "learning_rate": 0.0034510348553762438, + "loss": 2.8458, + "mean_token_accuracy": 0.4122627377510071, + "num_tokens": 7771910576.0, + "step": 15204 + }, + { + "epoch": 4.111681990265008, + "grad_norm": 3.40625, + "learning_rate": 0.0034501766045448535, + "loss": 2.7754, + "mean_token_accuracy": 0.43646639585494995, + "num_tokens": 7772434845.0, + "step": 15205 + }, + { + "epoch": 4.111952406706328, + "grad_norm": 3.640625, + "learning_rate": 0.003449318585364246, + "loss": 2.8825, + "mean_token_accuracy": 0.43494054675102234, + "num_tokens": 7772958970.0, + "step": 15206 + }, + { + "epoch": 4.112222823147647, + "grad_norm": 4.1875, + "learning_rate": 0.0034484607978607505, + "loss": 2.9233, + "mean_token_accuracy": 0.4393024444580078, + "num_tokens": 7773458036.0, + "step": 15207 + }, + { + "epoch": 4.112493239588967, + "grad_norm": 4.0625, + "learning_rate": 0.003447603242060686, + "loss": 2.8952, + "mean_token_accuracy": 0.43606212735176086, + "num_tokens": 7773982156.0, + "step": 15208 + }, + { + "epoch": 4.112763656030286, + "grad_norm": 27.75, + "learning_rate": 0.0034467459179903627, + "loss": 2.7915, + "mean_token_accuracy": 0.44333651661872864, + "num_tokens": 7774417498.0, + "step": 15209 + }, + { + "epoch": 4.113034072471606, + "grad_norm": 6.65625, + "learning_rate": 0.003445888825676089, + "loss": 3.05, + "mean_token_accuracy": 0.38011783361434937, + "num_tokens": 7774941714.0, + "step": 15210 + }, + { + "epoch": 4.1133044889129255, + "grad_norm": 334.0, + "learning_rate": 0.0034450319651441606, + "loss": 15.9427, + "mean_token_accuracy": 0.0011430592276155949, + "num_tokens": 7775430769.0, + "step": 15211 + }, + { + "epoch": 4.113574905354246, + "grad_norm": 6.15625, + "learning_rate": 0.0034441753364208684, + "loss": 2.8171, + "mean_token_accuracy": 0.45718836784362793, + "num_tokens": 7775870441.0, + "step": 15212 + }, + { + "epoch": 4.113845321795565, + "grad_norm": 2.625, + "learning_rate": 0.003443318939532498, + "loss": 2.8664, + "mean_token_accuracy": 0.43280723690986633, + "num_tokens": 7776394663.0, + "step": 15213 + }, + { + "epoch": 4.114115738236885, + "grad_norm": 3.59375, + "learning_rate": 0.003442462774505326, + "loss": 2.8572, + "mean_token_accuracy": 0.43906378746032715, + "num_tokens": 7776883574.0, + "step": 15214 + }, + { + "epoch": 4.114386154678204, + "grad_norm": 3.6875, + "learning_rate": 0.003441606841365619, + "loss": 2.7235, + "mean_token_accuracy": 0.43585386872291565, + "num_tokens": 7777407699.0, + "step": 15215 + }, + { + "epoch": 4.114656571119524, + "grad_norm": 3.421875, + "learning_rate": 0.003440751140139644, + "loss": 2.6815, + "mean_token_accuracy": 0.4435080885887146, + "num_tokens": 7777931967.0, + "step": 15216 + }, + { + "epoch": 4.114926987560843, + "grad_norm": 4.03125, + "learning_rate": 0.0034398956708536522, + "loss": 2.7334, + "mean_token_accuracy": 0.43608784675598145, + "num_tokens": 7778424830.0, + "step": 15217 + }, + { + "epoch": 4.1151974040021635, + "grad_norm": 3.34375, + "learning_rate": 0.003439040433533897, + "loss": 2.9208, + "mean_token_accuracy": 0.43047839403152466, + "num_tokens": 7778949066.0, + "step": 15218 + }, + { + "epoch": 4.115467820443483, + "grad_norm": 4.5625, + "learning_rate": 0.0034381854282066137, + "loss": 2.9553, + "mean_token_accuracy": 0.42944422364234924, + "num_tokens": 7779473263.0, + "step": 15219 + }, + { + "epoch": 4.115738236884803, + "grad_norm": 3.53125, + "learning_rate": 0.003437330654898041, + "loss": 2.6898, + "mean_token_accuracy": 0.4622626304626465, + "num_tokens": 7779903839.0, + "step": 15220 + }, + { + "epoch": 4.116008653326122, + "grad_norm": 3.703125, + "learning_rate": 0.003436476113634403, + "loss": 2.9337, + "mean_token_accuracy": 0.43683499097824097, + "num_tokens": 7780428075.0, + "step": 15221 + }, + { + "epoch": 4.116279069767442, + "grad_norm": 14.9375, + "learning_rate": 0.0034356218044419217, + "loss": 2.8782, + "mean_token_accuracy": 0.43007320165634155, + "num_tokens": 7780952214.0, + "step": 15222 + }, + { + "epoch": 4.116549486208761, + "grad_norm": 3.09375, + "learning_rate": 0.0034347677273468074, + "loss": 2.6089, + "mean_token_accuracy": 0.4593648910522461, + "num_tokens": 7781460407.0, + "step": 15223 + }, + { + "epoch": 4.116819902650081, + "grad_norm": 3.078125, + "learning_rate": 0.0034339138823752703, + "loss": 2.7628, + "mean_token_accuracy": 0.43134814500808716, + "num_tokens": 7781984512.0, + "step": 15224 + }, + { + "epoch": 4.1170903190914006, + "grad_norm": 3.59375, + "learning_rate": 0.0034330602695535036, + "loss": 2.7298, + "mean_token_accuracy": 0.43365365266799927, + "num_tokens": 7782508643.0, + "step": 15225 + }, + { + "epoch": 4.117360735532721, + "grad_norm": 3.40625, + "learning_rate": 0.0034322068889076998, + "loss": 2.7794, + "mean_token_accuracy": 0.43637770414352417, + "num_tokens": 7783032767.0, + "step": 15226 + }, + { + "epoch": 4.11763115197404, + "grad_norm": 3.90625, + "learning_rate": 0.003431353740464046, + "loss": 3.0714, + "mean_token_accuracy": 0.43176019191741943, + "num_tokens": 7783557044.0, + "step": 15227 + }, + { + "epoch": 4.11790156841536, + "grad_norm": 3.59375, + "learning_rate": 0.003430500824248718, + "loss": 2.8569, + "mean_token_accuracy": 0.44259995222091675, + "num_tokens": 7783994317.0, + "step": 15228 + }, + { + "epoch": 4.118171984856679, + "grad_norm": 2.984375, + "learning_rate": 0.003429648140287882, + "loss": 2.7044, + "mean_token_accuracy": 0.4509645700454712, + "num_tokens": 7784461682.0, + "step": 15229 + }, + { + "epoch": 4.118442401297999, + "grad_norm": 3.515625, + "learning_rate": 0.0034287956886077066, + "loss": 2.6925, + "mean_token_accuracy": 0.4352414011955261, + "num_tokens": 7784979274.0, + "step": 15230 + }, + { + "epoch": 4.118712817739318, + "grad_norm": 65.0, + "learning_rate": 0.003427943469234345, + "loss": 8.0211, + "mean_token_accuracy": 0.05485802888870239, + "num_tokens": 7785503366.0, + "step": 15231 + }, + { + "epoch": 4.1189832341806385, + "grad_norm": 5.625, + "learning_rate": 0.003427091482193944, + "loss": 3.0635, + "mean_token_accuracy": 0.44287601113319397, + "num_tokens": 7785976114.0, + "step": 15232 + }, + { + "epoch": 4.119253650621958, + "grad_norm": 3.015625, + "learning_rate": 0.0034262397275126454, + "loss": 2.7799, + "mean_token_accuracy": 0.44279971718788147, + "num_tokens": 7786489574.0, + "step": 15233 + }, + { + "epoch": 4.119524067063278, + "grad_norm": 3.015625, + "learning_rate": 0.0034253882052165873, + "loss": 2.8055, + "mean_token_accuracy": 0.454539954662323, + "num_tokens": 7786980013.0, + "step": 15234 + }, + { + "epoch": 4.119794483504597, + "grad_norm": 3.484375, + "learning_rate": 0.0034245369153318938, + "loss": 2.7811, + "mean_token_accuracy": 0.4436376094818115, + "num_tokens": 7787491704.0, + "step": 15235 + }, + { + "epoch": 4.120064899945917, + "grad_norm": 3.171875, + "learning_rate": 0.0034236858578846874, + "loss": 3.0467, + "mean_token_accuracy": 0.41986674070358276, + "num_tokens": 7788015986.0, + "step": 15236 + }, + { + "epoch": 4.120335316387236, + "grad_norm": 2.96875, + "learning_rate": 0.003422835032901079, + "loss": 2.6799, + "mean_token_accuracy": 0.4367198348045349, + "num_tokens": 7788501659.0, + "step": 15237 + }, + { + "epoch": 4.120605732828556, + "grad_norm": 2.671875, + "learning_rate": 0.0034219844404071733, + "loss": 2.6628, + "mean_token_accuracy": 0.45649254322052, + "num_tokens": 7789025929.0, + "step": 15238 + }, + { + "epoch": 4.1208761492698756, + "grad_norm": 3.109375, + "learning_rate": 0.0034211340804290715, + "loss": 2.8937, + "mean_token_accuracy": 0.4359230697154999, + "num_tokens": 7789550188.0, + "step": 15239 + }, + { + "epoch": 4.121146565711196, + "grad_norm": 3.484375, + "learning_rate": 0.0034202839529928653, + "loss": 3.0346, + "mean_token_accuracy": 0.42320290207862854, + "num_tokens": 7790025526.0, + "step": 15240 + }, + { + "epoch": 4.121416982152515, + "grad_norm": 3.09375, + "learning_rate": 0.003419434058124635, + "loss": 2.9003, + "mean_token_accuracy": 0.427303671836853, + "num_tokens": 7790549793.0, + "step": 15241 + }, + { + "epoch": 4.121687398593835, + "grad_norm": 3.375, + "learning_rate": 0.0034185843958504623, + "loss": 2.6977, + "mean_token_accuracy": 0.43714720010757446, + "num_tokens": 7791073980.0, + "step": 15242 + }, + { + "epoch": 4.121957815035154, + "grad_norm": 3.203125, + "learning_rate": 0.0034177349661964162, + "loss": 2.8257, + "mean_token_accuracy": 0.42418891191482544, + "num_tokens": 7791598018.0, + "step": 15243 + }, + { + "epoch": 4.122228231476473, + "grad_norm": 3.265625, + "learning_rate": 0.003416885769188559, + "loss": 2.8369, + "mean_token_accuracy": 0.4402596354484558, + "num_tokens": 7792122294.0, + "step": 15244 + }, + { + "epoch": 4.122498647917793, + "grad_norm": 3.859375, + "learning_rate": 0.0034160368048529476, + "loss": 2.7683, + "mean_token_accuracy": 0.45742589235305786, + "num_tokens": 7792606119.0, + "step": 15245 + }, + { + "epoch": 4.122769064359113, + "grad_norm": 3.453125, + "learning_rate": 0.003415188073215628, + "loss": 3.0364, + "mean_token_accuracy": 0.43243175745010376, + "num_tokens": 7793116642.0, + "step": 15246 + }, + { + "epoch": 4.123039480800433, + "grad_norm": 3.59375, + "learning_rate": 0.003414339574302643, + "loss": 2.814, + "mean_token_accuracy": 0.42202112078666687, + "num_tokens": 7793640759.0, + "step": 15247 + }, + { + "epoch": 4.123309897241752, + "grad_norm": 3.5625, + "learning_rate": 0.003413491308140031, + "loss": 2.4908, + "mean_token_accuracy": 0.4636126756668091, + "num_tokens": 7794164982.0, + "step": 15248 + }, + { + "epoch": 4.123580313683072, + "grad_norm": 3.796875, + "learning_rate": 0.0034126432747538134, + "loss": 2.6977, + "mean_token_accuracy": 0.46137410402297974, + "num_tokens": 7794689261.0, + "step": 15249 + }, + { + "epoch": 4.123850730124391, + "grad_norm": 4.1875, + "learning_rate": 0.0034117954741700158, + "loss": 2.9058, + "mean_token_accuracy": 0.43756920099258423, + "num_tokens": 7795213456.0, + "step": 15250 + }, + { + "epoch": 4.124121146565711, + "grad_norm": 164.0, + "learning_rate": 0.0034109479064146474, + "loss": 8.3364, + "mean_token_accuracy": 0.019877247512340546, + "num_tokens": 7795737634.0, + "step": 15251 + }, + { + "epoch": 4.1243915630070305, + "grad_norm": 6.25, + "learning_rate": 0.0034101005715137136, + "loss": 2.8472, + "mean_token_accuracy": 0.43173137307167053, + "num_tokens": 7796241957.0, + "step": 15252 + }, + { + "epoch": 4.124661979448351, + "grad_norm": 2.8125, + "learning_rate": 0.0034092534694932174, + "loss": 2.7352, + "mean_token_accuracy": 0.4502362608909607, + "num_tokens": 7796766049.0, + "step": 15253 + }, + { + "epoch": 4.12493239588967, + "grad_norm": 3.4375, + "learning_rate": 0.0034084066003791475, + "loss": 2.676, + "mean_token_accuracy": 0.45593249797821045, + "num_tokens": 7797290112.0, + "step": 15254 + }, + { + "epoch": 4.12520281233099, + "grad_norm": 2.8125, + "learning_rate": 0.003407559964197486, + "loss": 2.9535, + "mean_token_accuracy": 0.44230031967163086, + "num_tokens": 7797814321.0, + "step": 15255 + }, + { + "epoch": 4.125473228772309, + "grad_norm": 4.21875, + "learning_rate": 0.0034067135609742163, + "loss": 2.5971, + "mean_token_accuracy": 0.4490552544593811, + "num_tokens": 7798338423.0, + "step": 15256 + }, + { + "epoch": 4.125743645213629, + "grad_norm": 2.71875, + "learning_rate": 0.003405867390735302, + "loss": 2.8434, + "mean_token_accuracy": 0.4501575529575348, + "num_tokens": 7798845980.0, + "step": 15257 + }, + { + "epoch": 4.126014061654948, + "grad_norm": 3.453125, + "learning_rate": 0.003405021453506709, + "loss": 2.6172, + "mean_token_accuracy": 0.448400616645813, + "num_tokens": 7799370187.0, + "step": 15258 + }, + { + "epoch": 4.126284478096268, + "grad_norm": 2.71875, + "learning_rate": 0.0034041757493143954, + "loss": 2.7716, + "mean_token_accuracy": 0.4519878029823303, + "num_tokens": 7799894448.0, + "step": 15259 + }, + { + "epoch": 4.126554894537588, + "grad_norm": 3.3125, + "learning_rate": 0.0034033302781843054, + "loss": 2.8456, + "mean_token_accuracy": 0.42510104179382324, + "num_tokens": 7800418614.0, + "step": 15260 + }, + { + "epoch": 4.126825310978908, + "grad_norm": 8.5, + "learning_rate": 0.003402485040142384, + "loss": 2.699, + "mean_token_accuracy": 0.4721890687942505, + "num_tokens": 7800942851.0, + "step": 15261 + }, + { + "epoch": 4.127095727420227, + "grad_norm": 2.609375, + "learning_rate": 0.003401640035214563, + "loss": 2.7744, + "mean_token_accuracy": 0.45097893476486206, + "num_tokens": 7801467130.0, + "step": 15262 + }, + { + "epoch": 4.127366143861547, + "grad_norm": 3.765625, + "learning_rate": 0.0034007952634267737, + "loss": 2.7447, + "mean_token_accuracy": 0.44231361150741577, + "num_tokens": 7801991405.0, + "step": 15263 + }, + { + "epoch": 4.127636560302866, + "grad_norm": 3.6875, + "learning_rate": 0.003399950724804931, + "loss": 2.8676, + "mean_token_accuracy": 0.43486151099205017, + "num_tokens": 7802515435.0, + "step": 15264 + }, + { + "epoch": 4.127906976744186, + "grad_norm": 3.578125, + "learning_rate": 0.003399106419374952, + "loss": 2.9282, + "mean_token_accuracy": 0.43696361780166626, + "num_tokens": 7803039633.0, + "step": 15265 + }, + { + "epoch": 4.1281773931855055, + "grad_norm": 3.234375, + "learning_rate": 0.0033982623471627412, + "loss": 2.7738, + "mean_token_accuracy": 0.4400525987148285, + "num_tokens": 7803563893.0, + "step": 15266 + }, + { + "epoch": 4.128447809626826, + "grad_norm": 3.6875, + "learning_rate": 0.0033974185081941943, + "loss": 2.838, + "mean_token_accuracy": 0.4330531358718872, + "num_tokens": 7804088112.0, + "step": 15267 + }, + { + "epoch": 4.128718226068145, + "grad_norm": 3.875, + "learning_rate": 0.0033965749024952073, + "loss": 2.9577, + "mean_token_accuracy": 0.4388492703437805, + "num_tokens": 7804553471.0, + "step": 15268 + }, + { + "epoch": 4.128988642509465, + "grad_norm": 3.75, + "learning_rate": 0.0033957315300916615, + "loss": 2.7978, + "mean_token_accuracy": 0.46218061447143555, + "num_tokens": 7805012897.0, + "step": 15269 + }, + { + "epoch": 4.129259058950784, + "grad_norm": 5.96875, + "learning_rate": 0.003394888391009436, + "loss": 2.6714, + "mean_token_accuracy": 0.508265495300293, + "num_tokens": 7805474777.0, + "step": 15270 + }, + { + "epoch": 4.129529475392104, + "grad_norm": 158.0, + "learning_rate": 0.0033940454852744006, + "loss": 8.3606, + "mean_token_accuracy": 0.06923043727874756, + "num_tokens": 7805989307.0, + "step": 15271 + }, + { + "epoch": 4.129799891833423, + "grad_norm": 4.3125, + "learning_rate": 0.003393202812912415, + "loss": 2.7415, + "mean_token_accuracy": 0.4328746497631073, + "num_tokens": 7806513577.0, + "step": 15272 + }, + { + "epoch": 4.130070308274743, + "grad_norm": 2.671875, + "learning_rate": 0.00339236037394934, + "loss": 2.9192, + "mean_token_accuracy": 0.4422915577888489, + "num_tokens": 7806985992.0, + "step": 15273 + }, + { + "epoch": 4.130340724716063, + "grad_norm": 3.140625, + "learning_rate": 0.0033915181684110187, + "loss": 2.7537, + "mean_token_accuracy": 0.4338582456111908, + "num_tokens": 7807509993.0, + "step": 15274 + }, + { + "epoch": 4.130611141157383, + "grad_norm": 4.0, + "learning_rate": 0.0033906761963232966, + "loss": 2.501, + "mean_token_accuracy": 0.4957939088344574, + "num_tokens": 7808034180.0, + "step": 15275 + }, + { + "epoch": 4.130881557598702, + "grad_norm": 2.859375, + "learning_rate": 0.0033898344577120046, + "loss": 2.7434, + "mean_token_accuracy": 0.4262310862541199, + "num_tokens": 7808558340.0, + "step": 15276 + }, + { + "epoch": 4.131151974040022, + "grad_norm": 3.890625, + "learning_rate": 0.0033889929526029726, + "loss": 2.7757, + "mean_token_accuracy": 0.43675678968429565, + "num_tokens": 7809082580.0, + "step": 15277 + }, + { + "epoch": 4.131422390481341, + "grad_norm": 3.703125, + "learning_rate": 0.003388151681022019, + "loss": 3.0154, + "mean_token_accuracy": 0.4225342869758606, + "num_tokens": 7809606840.0, + "step": 15278 + }, + { + "epoch": 4.131692806922661, + "grad_norm": 3.578125, + "learning_rate": 0.0033873106429949586, + "loss": 2.766, + "mean_token_accuracy": 0.4377361834049225, + "num_tokens": 7810131018.0, + "step": 15279 + }, + { + "epoch": 4.1319632233639805, + "grad_norm": 3.875, + "learning_rate": 0.0033864698385475947, + "loss": 2.8042, + "mean_token_accuracy": 0.432423859834671, + "num_tokens": 7810655220.0, + "step": 15280 + }, + { + "epoch": 4.132233639805301, + "grad_norm": 3.796875, + "learning_rate": 0.0033856292677057237, + "loss": 2.9893, + "mean_token_accuracy": 0.427262544631958, + "num_tokens": 7811141276.0, + "step": 15281 + }, + { + "epoch": 4.13250405624662, + "grad_norm": 3.578125, + "learning_rate": 0.003384788930495142, + "loss": 2.789, + "mean_token_accuracy": 0.4474494457244873, + "num_tokens": 7811652058.0, + "step": 15282 + }, + { + "epoch": 4.13277447268794, + "grad_norm": 3.28125, + "learning_rate": 0.003383948826941631, + "loss": 2.8025, + "mean_token_accuracy": 0.3947426378726959, + "num_tokens": 7812176171.0, + "step": 15283 + }, + { + "epoch": 4.133044889129259, + "grad_norm": 3.453125, + "learning_rate": 0.0033831089570709643, + "loss": 2.8612, + "mean_token_accuracy": 0.42876678705215454, + "num_tokens": 7812700346.0, + "step": 15284 + }, + { + "epoch": 4.133315305570578, + "grad_norm": 3.40625, + "learning_rate": 0.003382269320908917, + "loss": 2.5329, + "mean_token_accuracy": 0.4598936140537262, + "num_tokens": 7813170273.0, + "step": 15285 + }, + { + "epoch": 4.133585722011898, + "grad_norm": 21.875, + "learning_rate": 0.003381429918481249, + "loss": 2.9279, + "mean_token_accuracy": 0.4567320942878723, + "num_tokens": 7813648494.0, + "step": 15286 + }, + { + "epoch": 4.1338561384532175, + "grad_norm": 5.40625, + "learning_rate": 0.0033805907498137143, + "loss": 2.9501, + "mean_token_accuracy": 0.41268646717071533, + "num_tokens": 7814172770.0, + "step": 15287 + }, + { + "epoch": 4.134126554894538, + "grad_norm": 3.40625, + "learning_rate": 0.003379751814932064, + "loss": 2.5304, + "mean_token_accuracy": 0.4722081422805786, + "num_tokens": 7814696915.0, + "step": 15288 + }, + { + "epoch": 4.134396971335857, + "grad_norm": 3.453125, + "learning_rate": 0.0033789131138620364, + "loss": 3.0818, + "mean_token_accuracy": 0.40169334411621094, + "num_tokens": 7815221131.0, + "step": 15289 + }, + { + "epoch": 4.134667387777177, + "grad_norm": 4.34375, + "learning_rate": 0.0033780746466293655, + "loss": 2.8764, + "mean_token_accuracy": 0.44057247042655945, + "num_tokens": 7815713189.0, + "step": 15290 + }, + { + "epoch": 4.134937804218496, + "grad_norm": 62.75, + "learning_rate": 0.0033772364132597823, + "loss": 6.695, + "mean_token_accuracy": 0.12092676758766174, + "num_tokens": 7816237314.0, + "step": 15291 + }, + { + "epoch": 4.135208220659816, + "grad_norm": 7.65625, + "learning_rate": 0.003376398413779001, + "loss": 2.7848, + "mean_token_accuracy": 0.43990057706832886, + "num_tokens": 7816761545.0, + "step": 15292 + }, + { + "epoch": 4.135478637101135, + "grad_norm": 2.703125, + "learning_rate": 0.0033755606482127375, + "loss": 2.88, + "mean_token_accuracy": 0.4051748812198639, + "num_tokens": 7817285820.0, + "step": 15293 + }, + { + "epoch": 4.1357490535424555, + "grad_norm": 3.46875, + "learning_rate": 0.003374723116586695, + "loss": 2.8907, + "mean_token_accuracy": 0.4258631467819214, + "num_tokens": 7817810067.0, + "step": 15294 + }, + { + "epoch": 4.136019469983775, + "grad_norm": 67.5, + "learning_rate": 0.0033738858189265707, + "loss": 3.0767, + "mean_token_accuracy": 0.4414764642715454, + "num_tokens": 7818334250.0, + "step": 15295 + }, + { + "epoch": 4.136289886425095, + "grad_norm": 5.6875, + "learning_rate": 0.003373048755258058, + "loss": 2.9541, + "mean_token_accuracy": 0.4223077893257141, + "num_tokens": 7818858476.0, + "step": 15296 + }, + { + "epoch": 4.136560302866414, + "grad_norm": 3.546875, + "learning_rate": 0.0033722119256068385, + "loss": 2.9163, + "mean_token_accuracy": 0.42213431000709534, + "num_tokens": 7819382579.0, + "step": 15297 + }, + { + "epoch": 4.136830719307734, + "grad_norm": 3.765625, + "learning_rate": 0.0033713753299985873, + "loss": 2.9084, + "mean_token_accuracy": 0.4401296377182007, + "num_tokens": 7819857905.0, + "step": 15298 + }, + { + "epoch": 4.137101135749053, + "grad_norm": 4.5, + "learning_rate": 0.0033705389684589773, + "loss": 2.6776, + "mean_token_accuracy": 0.46404558420181274, + "num_tokens": 7820295813.0, + "step": 15299 + }, + { + "epoch": 4.137371552190373, + "grad_norm": 3.109375, + "learning_rate": 0.0033697028410136676, + "loss": 2.8067, + "mean_token_accuracy": 0.42561036348342896, + "num_tokens": 7820820054.0, + "step": 15300 + }, + { + "epoch": 4.1376419686316925, + "grad_norm": 4.03125, + "learning_rate": 0.003368866947688314, + "loss": 2.639, + "mean_token_accuracy": 0.4691312909126282, + "num_tokens": 7821279584.0, + "step": 15301 + }, + { + "epoch": 4.137912385073013, + "grad_norm": 3.15625, + "learning_rate": 0.0033680312885085648, + "loss": 2.6705, + "mean_token_accuracy": 0.45303505659103394, + "num_tokens": 7821803504.0, + "step": 15302 + }, + { + "epoch": 4.138182801514332, + "grad_norm": 4.0, + "learning_rate": 0.003367195863500058, + "loss": 2.8316, + "mean_token_accuracy": 0.42083096504211426, + "num_tokens": 7822327747.0, + "step": 15303 + }, + { + "epoch": 4.138453217955652, + "grad_norm": 4.0, + "learning_rate": 0.003366360672688429, + "loss": 2.6879, + "mean_token_accuracy": 0.45776253938674927, + "num_tokens": 7822851980.0, + "step": 15304 + }, + { + "epoch": 4.138723634396971, + "grad_norm": 3.5625, + "learning_rate": 0.0033655257160993047, + "loss": 2.9128, + "mean_token_accuracy": 0.4385022521018982, + "num_tokens": 7823361908.0, + "step": 15305 + }, + { + "epoch": 4.138994050838291, + "grad_norm": 3.671875, + "learning_rate": 0.0033646909937583036, + "loss": 2.7615, + "mean_token_accuracy": 0.44362321496009827, + "num_tokens": 7823877280.0, + "step": 15306 + }, + { + "epoch": 4.13926446727961, + "grad_norm": 3.46875, + "learning_rate": 0.003363856505691035, + "loss": 2.5645, + "mean_token_accuracy": 0.4702032208442688, + "num_tokens": 7824344832.0, + "step": 15307 + }, + { + "epoch": 4.1395348837209305, + "grad_norm": 3.078125, + "learning_rate": 0.0033630222519231067, + "loss": 2.7391, + "mean_token_accuracy": 0.4472863972187042, + "num_tokens": 7824868996.0, + "step": 15308 + }, + { + "epoch": 4.13980530016225, + "grad_norm": 4.15625, + "learning_rate": 0.0033621882324801145, + "loss": 2.7165, + "mean_token_accuracy": 0.4449313282966614, + "num_tokens": 7825393095.0, + "step": 15309 + }, + { + "epoch": 4.14007571660357, + "grad_norm": 3.546875, + "learning_rate": 0.0033613544473876478, + "loss": 2.9361, + "mean_token_accuracy": 0.4286213517189026, + "num_tokens": 7825917332.0, + "step": 15310 + }, + { + "epoch": 4.140346133044889, + "grad_norm": 126.0, + "learning_rate": 0.003360520896671292, + "loss": 6.7346, + "mean_token_accuracy": 0.1334446370601654, + "num_tokens": 7826441520.0, + "step": 15311 + }, + { + "epoch": 4.140616549486209, + "grad_norm": 7.0625, + "learning_rate": 0.0033596875803566214, + "loss": 2.9796, + "mean_token_accuracy": 0.4160078167915344, + "num_tokens": 7826965642.0, + "step": 15312 + }, + { + "epoch": 4.140886965927528, + "grad_norm": 2.734375, + "learning_rate": 0.0033588544984692013, + "loss": 2.9584, + "mean_token_accuracy": 0.4394504725933075, + "num_tokens": 7827474309.0, + "step": 15313 + }, + { + "epoch": 4.141157382368848, + "grad_norm": 3.6875, + "learning_rate": 0.0033580216510346, + "loss": 2.9623, + "mean_token_accuracy": 0.4149251878261566, + "num_tokens": 7827960801.0, + "step": 15314 + }, + { + "epoch": 4.1414277988101675, + "grad_norm": 2.109375, + "learning_rate": 0.0033571890380783656, + "loss": 2.8046, + "mean_token_accuracy": 0.4394279718399048, + "num_tokens": 7828484974.0, + "step": 15315 + }, + { + "epoch": 4.141698215251488, + "grad_norm": 2.96875, + "learning_rate": 0.0033563566596260487, + "loss": 2.7449, + "mean_token_accuracy": 0.4407251477241516, + "num_tokens": 7829009123.0, + "step": 15316 + }, + { + "epoch": 4.141968631692807, + "grad_norm": 3.421875, + "learning_rate": 0.0033555245157031862, + "loss": 2.744, + "mean_token_accuracy": 0.4450644254684448, + "num_tokens": 7829509789.0, + "step": 15317 + }, + { + "epoch": 4.142239048134127, + "grad_norm": 3.484375, + "learning_rate": 0.0033546926063353147, + "loss": 2.6688, + "mean_token_accuracy": 0.44982582330703735, + "num_tokens": 7829995023.0, + "step": 15318 + }, + { + "epoch": 4.142509464575446, + "grad_norm": 3.0, + "learning_rate": 0.0033538609315479553, + "loss": 2.6547, + "mean_token_accuracy": 0.4552982449531555, + "num_tokens": 7830519203.0, + "step": 15319 + }, + { + "epoch": 4.142779881016766, + "grad_norm": 3.171875, + "learning_rate": 0.003353029491366631, + "loss": 2.6547, + "mean_token_accuracy": 0.414524108171463, + "num_tokens": 7830981495.0, + "step": 15320 + }, + { + "epoch": 4.143050297458085, + "grad_norm": 3.578125, + "learning_rate": 0.0033521982858168487, + "loss": 2.6947, + "mean_token_accuracy": 0.44840043783187866, + "num_tokens": 7831499540.0, + "step": 15321 + }, + { + "epoch": 4.1433207138994055, + "grad_norm": 4.5625, + "learning_rate": 0.003351367314924114, + "loss": 2.7887, + "mean_token_accuracy": 0.446505606174469, + "num_tokens": 7832023744.0, + "step": 15322 + }, + { + "epoch": 4.143591130340725, + "grad_norm": 2.75, + "learning_rate": 0.0033505365787139243, + "loss": 2.9372, + "mean_token_accuracy": 0.4265512228012085, + "num_tokens": 7832548025.0, + "step": 15323 + }, + { + "epoch": 4.143861546782045, + "grad_norm": 3.21875, + "learning_rate": 0.003349706077211765, + "loss": 2.7568, + "mean_token_accuracy": 0.4393687844276428, + "num_tokens": 7833072227.0, + "step": 15324 + }, + { + "epoch": 4.144131963223364, + "grad_norm": 3.296875, + "learning_rate": 0.003348875810443125, + "loss": 2.7849, + "mean_token_accuracy": 0.4262275993824005, + "num_tokens": 7833596441.0, + "step": 15325 + }, + { + "epoch": 4.144402379664683, + "grad_norm": 3.5, + "learning_rate": 0.003348045778433474, + "loss": 2.8368, + "mean_token_accuracy": 0.42824089527130127, + "num_tokens": 7834112104.0, + "step": 15326 + }, + { + "epoch": 4.144672796106003, + "grad_norm": 2.984375, + "learning_rate": 0.003347215981208281, + "loss": 2.9157, + "mean_token_accuracy": 0.4371092915534973, + "num_tokens": 7834636258.0, + "step": 15327 + }, + { + "epoch": 4.1449432125473225, + "grad_norm": 3.953125, + "learning_rate": 0.0033463864187930083, + "loss": 2.6828, + "mean_token_accuracy": 0.45041078329086304, + "num_tokens": 7835160435.0, + "step": 15328 + }, + { + "epoch": 4.1452136289886425, + "grad_norm": 2.984375, + "learning_rate": 0.0033455570912131073, + "loss": 2.8773, + "mean_token_accuracy": 0.43543779850006104, + "num_tokens": 7835660506.0, + "step": 15329 + }, + { + "epoch": 4.145484045429962, + "grad_norm": 3.71875, + "learning_rate": 0.0033447279984940236, + "loss": 2.5925, + "mean_token_accuracy": 0.4309573769569397, + "num_tokens": 7836184753.0, + "step": 15330 + }, + { + "epoch": 4.145754461871282, + "grad_norm": 139.0, + "learning_rate": 0.0033438991406611997, + "loss": 10.6104, + "mean_token_accuracy": 0.0033957420382648706, + "num_tokens": 7836708933.0, + "step": 15331 + }, + { + "epoch": 4.146024878312601, + "grad_norm": 6.59375, + "learning_rate": 0.0033430705177400627, + "loss": 2.9019, + "mean_token_accuracy": 0.4375218152999878, + "num_tokens": 7837199492.0, + "step": 15332 + }, + { + "epoch": 4.146295294753921, + "grad_norm": 2.28125, + "learning_rate": 0.00334224212975604, + "loss": 2.848, + "mean_token_accuracy": 0.42747437953948975, + "num_tokens": 7837723520.0, + "step": 15333 + }, + { + "epoch": 4.14656571119524, + "grad_norm": 2.8125, + "learning_rate": 0.003341413976734552, + "loss": 2.8436, + "mean_token_accuracy": 0.44533711671829224, + "num_tokens": 7838247800.0, + "step": 15334 + }, + { + "epoch": 4.14683612763656, + "grad_norm": 3.328125, + "learning_rate": 0.0033405860587010046, + "loss": 2.6985, + "mean_token_accuracy": 0.47148633003234863, + "num_tokens": 7838771989.0, + "step": 15335 + }, + { + "epoch": 4.14710654407788, + "grad_norm": 3.78125, + "learning_rate": 0.0033397583756807994, + "loss": 2.7795, + "mean_token_accuracy": 0.4339001774787903, + "num_tokens": 7839296150.0, + "step": 15336 + }, + { + "epoch": 4.1473769605192, + "grad_norm": 3.21875, + "learning_rate": 0.0033389309276993374, + "loss": 2.9514, + "mean_token_accuracy": 0.41939669847488403, + "num_tokens": 7839773564.0, + "step": 15337 + }, + { + "epoch": 4.147647376960519, + "grad_norm": 3.4375, + "learning_rate": 0.0033381037147820016, + "loss": 2.625, + "mean_token_accuracy": 0.43101412057876587, + "num_tokens": 7840297678.0, + "step": 15338 + }, + { + "epoch": 4.147917793401839, + "grad_norm": 3.5625, + "learning_rate": 0.0033372767369541784, + "loss": 2.6589, + "mean_token_accuracy": 0.4440428912639618, + "num_tokens": 7840821901.0, + "step": 15339 + }, + { + "epoch": 4.148188209843158, + "grad_norm": 3.015625, + "learning_rate": 0.0033364499942412393, + "loss": 2.8588, + "mean_token_accuracy": 0.4348447024822235, + "num_tokens": 7841346060.0, + "step": 15340 + }, + { + "epoch": 4.148458626284478, + "grad_norm": 3.109375, + "learning_rate": 0.0033356234866685486, + "loss": 2.7526, + "mean_token_accuracy": 0.44561752676963806, + "num_tokens": 7841770871.0, + "step": 15341 + }, + { + "epoch": 4.1487290427257975, + "grad_norm": 2.609375, + "learning_rate": 0.003334797214261473, + "loss": 2.9253, + "mean_token_accuracy": 0.43717411160469055, + "num_tokens": 7842295046.0, + "step": 15342 + }, + { + "epoch": 4.1489994591671175, + "grad_norm": 3.734375, + "learning_rate": 0.0033339711770453595, + "loss": 2.7242, + "mean_token_accuracy": 0.44707781076431274, + "num_tokens": 7842819304.0, + "step": 15343 + }, + { + "epoch": 4.149269875608437, + "grad_norm": 3.171875, + "learning_rate": 0.003333145375045553, + "loss": 2.817, + "mean_token_accuracy": 0.42012137174606323, + "num_tokens": 7843343523.0, + "step": 15344 + }, + { + "epoch": 4.149540292049757, + "grad_norm": 3.484375, + "learning_rate": 0.003332319808287394, + "loss": 2.6697, + "mean_token_accuracy": 0.4541343152523041, + "num_tokens": 7843826410.0, + "step": 15345 + }, + { + "epoch": 4.149810708491076, + "grad_norm": 3.25, + "learning_rate": 0.003331494476796211, + "loss": 2.7521, + "mean_token_accuracy": 0.4770911931991577, + "num_tokens": 7844290547.0, + "step": 15346 + }, + { + "epoch": 4.150081124932396, + "grad_norm": 3.515625, + "learning_rate": 0.0033306693805973298, + "loss": 2.9184, + "mean_token_accuracy": 0.4097370207309723, + "num_tokens": 7844814742.0, + "step": 15347 + }, + { + "epoch": 4.150351541373715, + "grad_norm": 3.359375, + "learning_rate": 0.0033298445197160664, + "loss": 2.933, + "mean_token_accuracy": 0.4401592016220093, + "num_tokens": 7845339003.0, + "step": 15348 + }, + { + "epoch": 4.150621957815035, + "grad_norm": 3.515625, + "learning_rate": 0.0033290198941777305, + "loss": 2.6964, + "mean_token_accuracy": 0.44096559286117554, + "num_tokens": 7845863240.0, + "step": 15349 + }, + { + "epoch": 4.150892374256355, + "grad_norm": 2.984375, + "learning_rate": 0.0033281955040076216, + "loss": 2.8192, + "mean_token_accuracy": 0.443420946598053, + "num_tokens": 7846387335.0, + "step": 15350 + }, + { + "epoch": 4.151162790697675, + "grad_norm": 88.5, + "learning_rate": 0.0033273713492310377, + "loss": 9.581, + "mean_token_accuracy": 0.03739847242832184, + "num_tokens": 7846911581.0, + "step": 15351 + }, + { + "epoch": 4.151433207138994, + "grad_norm": 8.6875, + "learning_rate": 0.0033265474298732646, + "loss": 3.1041, + "mean_token_accuracy": 0.4067547917366028, + "num_tokens": 7847435702.0, + "step": 15352 + }, + { + "epoch": 4.151703623580314, + "grad_norm": 2.59375, + "learning_rate": 0.0033257237459595806, + "loss": 2.8339, + "mean_token_accuracy": 0.4341104030609131, + "num_tokens": 7847959862.0, + "step": 15353 + }, + { + "epoch": 4.151974040021633, + "grad_norm": 3.109375, + "learning_rate": 0.003324900297515262, + "loss": 2.734, + "mean_token_accuracy": 0.4536890983581543, + "num_tokens": 7848484139.0, + "step": 15354 + }, + { + "epoch": 4.152244456462953, + "grad_norm": 3.65625, + "learning_rate": 0.0033240770845655732, + "loss": 2.8545, + "mean_token_accuracy": 0.43376368284225464, + "num_tokens": 7848957223.0, + "step": 15355 + }, + { + "epoch": 4.1525148729042725, + "grad_norm": 3.40625, + "learning_rate": 0.0033232541071357715, + "loss": 2.5459, + "mean_token_accuracy": 0.4509691894054413, + "num_tokens": 7849473724.0, + "step": 15356 + }, + { + "epoch": 4.1527852893455925, + "grad_norm": 3.265625, + "learning_rate": 0.0033224313652511117, + "loss": 2.9152, + "mean_token_accuracy": 0.4369064271450043, + "num_tokens": 7849997924.0, + "step": 15357 + }, + { + "epoch": 4.153055705786912, + "grad_norm": 4.53125, + "learning_rate": 0.0033216088589368364, + "loss": 2.7135, + "mean_token_accuracy": 0.4585225284099579, + "num_tokens": 7850522177.0, + "step": 15358 + }, + { + "epoch": 4.153326122228232, + "grad_norm": 4.34375, + "learning_rate": 0.0033207865882181786, + "loss": 2.928, + "mean_token_accuracy": 0.4390983581542969, + "num_tokens": 7851007979.0, + "step": 15359 + }, + { + "epoch": 4.153596538669551, + "grad_norm": 4.875, + "learning_rate": 0.0033199645531203728, + "loss": 2.5892, + "mean_token_accuracy": 0.49962669610977173, + "num_tokens": 7851520333.0, + "step": 15360 + }, + { + "epoch": 4.153866955110871, + "grad_norm": 3.75, + "learning_rate": 0.0033191427536686403, + "loss": 2.986, + "mean_token_accuracy": 0.4417804479598999, + "num_tokens": 7851977197.0, + "step": 15361 + }, + { + "epoch": 4.15413737155219, + "grad_norm": 4.875, + "learning_rate": 0.003318321189888195, + "loss": 2.7542, + "mean_token_accuracy": 0.4525371789932251, + "num_tokens": 7852501437.0, + "step": 15362 + }, + { + "epoch": 4.15440778799351, + "grad_norm": 3.265625, + "learning_rate": 0.0033174998618042484, + "loss": 2.7713, + "mean_token_accuracy": 0.4340832531452179, + "num_tokens": 7853025660.0, + "step": 15363 + }, + { + "epoch": 4.15467820443483, + "grad_norm": 3.703125, + "learning_rate": 0.0033166787694419953, + "loss": 2.9718, + "mean_token_accuracy": 0.3938608169555664, + "num_tokens": 7853549845.0, + "step": 15364 + }, + { + "epoch": 4.15494862087615, + "grad_norm": 3.234375, + "learning_rate": 0.0033158579128266365, + "loss": 3.0006, + "mean_token_accuracy": 0.4248168170452118, + "num_tokens": 7854073974.0, + "step": 15365 + }, + { + "epoch": 4.155219037317469, + "grad_norm": 3.609375, + "learning_rate": 0.003315037291983353, + "loss": 2.7994, + "mean_token_accuracy": 0.46273642778396606, + "num_tokens": 7854533769.0, + "step": 15366 + }, + { + "epoch": 4.155489453758788, + "grad_norm": 3.03125, + "learning_rate": 0.003314216906937325, + "loss": 2.7566, + "mean_token_accuracy": 0.44152581691741943, + "num_tokens": 7855057959.0, + "step": 15367 + }, + { + "epoch": 4.155759870200108, + "grad_norm": 3.765625, + "learning_rate": 0.0033133967577137257, + "loss": 2.7239, + "mean_token_accuracy": 0.486236572265625, + "num_tokens": 7855582195.0, + "step": 15368 + }, + { + "epoch": 4.156030286641427, + "grad_norm": 3.5, + "learning_rate": 0.0033125768443377185, + "loss": 2.6537, + "mean_token_accuracy": 0.4301486611366272, + "num_tokens": 7856106427.0, + "step": 15369 + }, + { + "epoch": 4.1563007030827475, + "grad_norm": 2.9375, + "learning_rate": 0.0033117571668344593, + "loss": 2.6794, + "mean_token_accuracy": 0.44933897256851196, + "num_tokens": 7856630641.0, + "step": 15370 + }, + { + "epoch": 4.156571119524067, + "grad_norm": 84.0, + "learning_rate": 0.003310937725229103, + "loss": 8.9408, + "mean_token_accuracy": 0.025956016033887863, + "num_tokens": 7857099437.0, + "step": 15371 + }, + { + "epoch": 4.156841535965387, + "grad_norm": 6.0, + "learning_rate": 0.0033101185195467875, + "loss": 2.8679, + "mean_token_accuracy": 0.43637388944625854, + "num_tokens": 7857618927.0, + "step": 15372 + }, + { + "epoch": 4.157111952406706, + "grad_norm": 3.234375, + "learning_rate": 0.003309299549812649, + "loss": 3.0135, + "mean_token_accuracy": 0.4149240255355835, + "num_tokens": 7858143183.0, + "step": 15373 + }, + { + "epoch": 4.157382368848026, + "grad_norm": 3.65625, + "learning_rate": 0.0033084808160518176, + "loss": 3.0031, + "mean_token_accuracy": 0.41404658555984497, + "num_tokens": 7858667379.0, + "step": 15374 + }, + { + "epoch": 4.157652785289345, + "grad_norm": 4.125, + "learning_rate": 0.003307662318289415, + "loss": 2.9125, + "mean_token_accuracy": 0.43830186128616333, + "num_tokens": 7859141591.0, + "step": 15375 + }, + { + "epoch": 4.157923201730665, + "grad_norm": 4.0, + "learning_rate": 0.003306844056550553, + "loss": 2.8219, + "mean_token_accuracy": 0.44440436363220215, + "num_tokens": 7859665777.0, + "step": 15376 + }, + { + "epoch": 4.1581936181719845, + "grad_norm": 4.09375, + "learning_rate": 0.0033060260308603413, + "loss": 2.8089, + "mean_token_accuracy": 0.427484393119812, + "num_tokens": 7860189954.0, + "step": 15377 + }, + { + "epoch": 4.158464034613305, + "grad_norm": 3.71875, + "learning_rate": 0.0033052082412438778, + "loss": 2.9314, + "mean_token_accuracy": 0.42241355776786804, + "num_tokens": 7860664254.0, + "step": 15378 + }, + { + "epoch": 4.158734451054624, + "grad_norm": 4.0, + "learning_rate": 0.0033043906877262513, + "loss": 2.7149, + "mean_token_accuracy": 0.44120123982429504, + "num_tokens": 7861188475.0, + "step": 15379 + }, + { + "epoch": 4.159004867495944, + "grad_norm": 2.8125, + "learning_rate": 0.0033035733703325534, + "loss": 3.0239, + "mean_token_accuracy": 0.41885823011398315, + "num_tokens": 7861712698.0, + "step": 15380 + }, + { + "epoch": 4.159275283937263, + "grad_norm": 3.640625, + "learning_rate": 0.003302756289087857, + "loss": 2.8881, + "mean_token_accuracy": 0.4117460250854492, + "num_tokens": 7862236734.0, + "step": 15381 + }, + { + "epoch": 4.159545700378583, + "grad_norm": 3.453125, + "learning_rate": 0.0033019394440172312, + "loss": 2.8306, + "mean_token_accuracy": 0.43271908164024353, + "num_tokens": 7862761005.0, + "step": 15382 + }, + { + "epoch": 4.159816116819902, + "grad_norm": 4.375, + "learning_rate": 0.003301122835145745, + "loss": 2.8594, + "mean_token_accuracy": 0.4717612862586975, + "num_tokens": 7863222287.0, + "step": 15383 + }, + { + "epoch": 4.1600865332612225, + "grad_norm": 3.328125, + "learning_rate": 0.003300306462498449, + "loss": 2.7385, + "mean_token_accuracy": 0.44370681047439575, + "num_tokens": 7863742600.0, + "step": 15384 + }, + { + "epoch": 4.160356949702542, + "grad_norm": 4.09375, + "learning_rate": 0.0032994903261003962, + "loss": 2.7809, + "mean_token_accuracy": 0.4278936982154846, + "num_tokens": 7864266733.0, + "step": 15385 + }, + { + "epoch": 4.160627366143862, + "grad_norm": 3.75, + "learning_rate": 0.003298674425976626, + "loss": 2.4748, + "mean_token_accuracy": 0.49497199058532715, + "num_tokens": 7864761353.0, + "step": 15386 + }, + { + "epoch": 4.160897782585181, + "grad_norm": 3.484375, + "learning_rate": 0.0032978587621521694, + "loss": 2.8952, + "mean_token_accuracy": 0.4313022792339325, + "num_tokens": 7865278180.0, + "step": 15387 + }, + { + "epoch": 4.161168199026501, + "grad_norm": 3.5, + "learning_rate": 0.00329704333465206, + "loss": 2.7993, + "mean_token_accuracy": 0.4430837631225586, + "num_tokens": 7865802455.0, + "step": 15388 + }, + { + "epoch": 4.16143861546782, + "grad_norm": 4.53125, + "learning_rate": 0.0032962281435013107, + "loss": 2.9256, + "mean_token_accuracy": 0.43770483136177063, + "num_tokens": 7866326701.0, + "step": 15389 + }, + { + "epoch": 4.16170903190914, + "grad_norm": 4.53125, + "learning_rate": 0.0032954131887249372, + "loss": 2.7999, + "mean_token_accuracy": 0.4364297389984131, + "num_tokens": 7866850867.0, + "step": 15390 + }, + { + "epoch": 4.1619794483504595, + "grad_norm": 78.5, + "learning_rate": 0.0032945984703479482, + "loss": 10.2361, + "mean_token_accuracy": 0.0074685681611299515, + "num_tokens": 7867283021.0, + "step": 15391 + }, + { + "epoch": 4.16224986479178, + "grad_norm": 6.375, + "learning_rate": 0.003293783988395338, + "loss": 3.0069, + "mean_token_accuracy": 0.40801751613616943, + "num_tokens": 7867807177.0, + "step": 15392 + }, + { + "epoch": 4.162520281233099, + "grad_norm": 2.96875, + "learning_rate": 0.003292969742892095, + "loss": 2.9649, + "mean_token_accuracy": 0.43648797273635864, + "num_tokens": 7868293606.0, + "step": 15393 + }, + { + "epoch": 4.162790697674419, + "grad_norm": 4.03125, + "learning_rate": 0.0032921557338632065, + "loss": 2.7956, + "mean_token_accuracy": 0.41800037026405334, + "num_tokens": 7868817841.0, + "step": 15394 + }, + { + "epoch": 4.163061114115738, + "grad_norm": 3.75, + "learning_rate": 0.0032913419613336482, + "loss": 2.8463, + "mean_token_accuracy": 0.44456207752227783, + "num_tokens": 7869342020.0, + "step": 15395 + }, + { + "epoch": 4.163331530557058, + "grad_norm": 3.078125, + "learning_rate": 0.003290528425328385, + "loss": 2.8121, + "mean_token_accuracy": 0.44819560647010803, + "num_tokens": 7869846614.0, + "step": 15396 + }, + { + "epoch": 4.163601946998377, + "grad_norm": 3.765625, + "learning_rate": 0.0032897151258723858, + "loss": 2.906, + "mean_token_accuracy": 0.42532408237457275, + "num_tokens": 7870370775.0, + "step": 15397 + }, + { + "epoch": 4.1638723634396975, + "grad_norm": 3.734375, + "learning_rate": 0.0032889020629906004, + "loss": 2.836, + "mean_token_accuracy": 0.4577556848526001, + "num_tokens": 7870848003.0, + "step": 15398 + }, + { + "epoch": 4.164142779881017, + "grad_norm": 3.59375, + "learning_rate": 0.003288089236707974, + "loss": 2.7706, + "mean_token_accuracy": 0.42432284355163574, + "num_tokens": 7871340886.0, + "step": 15399 + }, + { + "epoch": 4.164413196322337, + "grad_norm": 3.296875, + "learning_rate": 0.003287276647049451, + "loss": 2.6392, + "mean_token_accuracy": 0.4968840479850769, + "num_tokens": 7871808985.0, + "step": 15400 + }, + { + "epoch": 4.164683612763656, + "grad_norm": 3.09375, + "learning_rate": 0.0032864642940399623, + "loss": 2.8986, + "mean_token_accuracy": 0.4121342897415161, + "num_tokens": 7872322793.0, + "step": 15401 + }, + { + "epoch": 4.164954029204976, + "grad_norm": 3.25, + "learning_rate": 0.003285652177704431, + "loss": 2.6624, + "mean_token_accuracy": 0.437863290309906, + "num_tokens": 7872847035.0, + "step": 15402 + }, + { + "epoch": 4.165224445646295, + "grad_norm": 39.0, + "learning_rate": 0.0032848402980677777, + "loss": 2.8323, + "mean_token_accuracy": 0.42612263560295105, + "num_tokens": 7873371302.0, + "step": 15403 + }, + { + "epoch": 4.165494862087615, + "grad_norm": 5.5, + "learning_rate": 0.0032840286551549147, + "loss": 2.9989, + "mean_token_accuracy": 0.40066656470298767, + "num_tokens": 7873895547.0, + "step": 15404 + }, + { + "epoch": 4.1657652785289345, + "grad_norm": 2.8125, + "learning_rate": 0.003283217248990742, + "loss": 2.9106, + "mean_token_accuracy": 0.4293598532676697, + "num_tokens": 7874407083.0, + "step": 15405 + }, + { + "epoch": 4.166035694970255, + "grad_norm": 4.0, + "learning_rate": 0.0032824060796001597, + "loss": 2.7694, + "mean_token_accuracy": 0.437301903963089, + "num_tokens": 7874931357.0, + "step": 15406 + }, + { + "epoch": 4.166306111411574, + "grad_norm": 3.359375, + "learning_rate": 0.003281595147008052, + "loss": 2.7646, + "mean_token_accuracy": 0.437913715839386, + "num_tokens": 7875455411.0, + "step": 15407 + }, + { + "epoch": 4.166576527852893, + "grad_norm": 4.03125, + "learning_rate": 0.003280784451239306, + "loss": 2.9034, + "mean_token_accuracy": 0.4206579327583313, + "num_tokens": 7875973756.0, + "step": 15408 + }, + { + "epoch": 4.166846944294213, + "grad_norm": 3.265625, + "learning_rate": 0.003279973992318794, + "loss": 2.675, + "mean_token_accuracy": 0.4463437497615814, + "num_tokens": 7876498026.0, + "step": 15409 + }, + { + "epoch": 4.167117360735532, + "grad_norm": 3.140625, + "learning_rate": 0.0032791637702713813, + "loss": 2.8885, + "mean_token_accuracy": 0.438839316368103, + "num_tokens": 7877022058.0, + "step": 15410 + }, + { + "epoch": 4.167387777176852, + "grad_norm": 22.125, + "learning_rate": 0.0032783537851219305, + "loss": 8.9422, + "mean_token_accuracy": 0.03572545945644379, + "num_tokens": 7877546307.0, + "step": 15411 + }, + { + "epoch": 4.167658193618172, + "grad_norm": 7.53125, + "learning_rate": 0.003277544036895295, + "loss": 3.0789, + "mean_token_accuracy": 0.4378401041030884, + "num_tokens": 7878041777.0, + "step": 15412 + }, + { + "epoch": 4.167928610059492, + "grad_norm": 2.828125, + "learning_rate": 0.003276734525616316, + "loss": 2.9424, + "mean_token_accuracy": 0.4236468970775604, + "num_tokens": 7878566052.0, + "step": 15413 + }, + { + "epoch": 4.168199026500811, + "grad_norm": 3.25, + "learning_rate": 0.003275925251309836, + "loss": 2.892, + "mean_token_accuracy": 0.4423137903213501, + "num_tokens": 7879090208.0, + "step": 15414 + }, + { + "epoch": 4.168469442942131, + "grad_norm": 2.890625, + "learning_rate": 0.003275116214000684, + "loss": 2.6881, + "mean_token_accuracy": 0.448065847158432, + "num_tokens": 7879614444.0, + "step": 15415 + }, + { + "epoch": 4.16873985938345, + "grad_norm": 4.0, + "learning_rate": 0.003274307413713683, + "loss": 2.7986, + "mean_token_accuracy": 0.42911356687545776, + "num_tokens": 7880138600.0, + "step": 15416 + }, + { + "epoch": 4.16901027582477, + "grad_norm": 3.671875, + "learning_rate": 0.0032734988504736495, + "loss": 2.7396, + "mean_token_accuracy": 0.4428364932537079, + "num_tokens": 7880662842.0, + "step": 15417 + }, + { + "epoch": 4.169280692266089, + "grad_norm": 3.203125, + "learning_rate": 0.0032726905243053955, + "loss": 2.5829, + "mean_token_accuracy": 0.4623543620109558, + "num_tokens": 7881187071.0, + "step": 15418 + }, + { + "epoch": 4.1695511087074095, + "grad_norm": 3.171875, + "learning_rate": 0.0032718824352337196, + "loss": 2.8169, + "mean_token_accuracy": 0.42881298065185547, + "num_tokens": 7881696804.0, + "step": 15419 + }, + { + "epoch": 4.169821525148729, + "grad_norm": 3.390625, + "learning_rate": 0.0032710745832834186, + "loss": 2.8384, + "mean_token_accuracy": 0.44254443049430847, + "num_tokens": 7882172754.0, + "step": 15420 + }, + { + "epoch": 4.170091941590049, + "grad_norm": 3.34375, + "learning_rate": 0.00327026696847928, + "loss": 2.8444, + "mean_token_accuracy": 0.4297010898590088, + "num_tokens": 7882696767.0, + "step": 15421 + }, + { + "epoch": 4.170362358031368, + "grad_norm": 3.65625, + "learning_rate": 0.003269459590846079, + "loss": 2.8597, + "mean_token_accuracy": 0.4236059784889221, + "num_tokens": 7883221046.0, + "step": 15422 + }, + { + "epoch": 4.170632774472688, + "grad_norm": 3.953125, + "learning_rate": 0.003268652450408594, + "loss": 2.9767, + "mean_token_accuracy": 0.42834287881851196, + "num_tokens": 7883745313.0, + "step": 15423 + }, + { + "epoch": 4.170903190914007, + "grad_norm": 4.53125, + "learning_rate": 0.003267845547191589, + "loss": 3.0843, + "mean_token_accuracy": 0.386819064617157, + "num_tokens": 7884269448.0, + "step": 15424 + }, + { + "epoch": 4.171173607355327, + "grad_norm": 3.34375, + "learning_rate": 0.0032670388812198192, + "loss": 2.7478, + "mean_token_accuracy": 0.45204558968544006, + "num_tokens": 7884793726.0, + "step": 15425 + }, + { + "epoch": 4.171444023796647, + "grad_norm": 3.953125, + "learning_rate": 0.0032662324525180387, + "loss": 2.8054, + "mean_token_accuracy": 0.4246165454387665, + "num_tokens": 7885317943.0, + "step": 15426 + }, + { + "epoch": 4.171714440237967, + "grad_norm": 3.796875, + "learning_rate": 0.003265426261110991, + "loss": 2.7137, + "mean_token_accuracy": 0.45802080631256104, + "num_tokens": 7885842070.0, + "step": 15427 + }, + { + "epoch": 4.171984856679286, + "grad_norm": 4.59375, + "learning_rate": 0.0032646203070234095, + "loss": 2.8889, + "mean_token_accuracy": 0.4302443265914917, + "num_tokens": 7886318888.0, + "step": 15428 + }, + { + "epoch": 4.172255273120606, + "grad_norm": 4.0625, + "learning_rate": 0.0032638145902800265, + "loss": 2.7163, + "mean_token_accuracy": 0.4374452829360962, + "num_tokens": 7886843145.0, + "step": 15429 + }, + { + "epoch": 4.172525689561925, + "grad_norm": 3.765625, + "learning_rate": 0.003263009110905561, + "loss": 2.982, + "mean_token_accuracy": 0.41964858770370483, + "num_tokens": 7887367383.0, + "step": 15430 + }, + { + "epoch": 4.172796106003245, + "grad_norm": 32.75, + "learning_rate": 0.003262203868924731, + "loss": 9.0099, + "mean_token_accuracy": 0.018684595823287964, + "num_tokens": 7887853776.0, + "step": 15431 + }, + { + "epoch": 4.173066522444564, + "grad_norm": 7.625, + "learning_rate": 0.0032613988643622393, + "loss": 2.8992, + "mean_token_accuracy": 0.4314723014831543, + "num_tokens": 7888360166.0, + "step": 15432 + }, + { + "epoch": 4.1733369388858845, + "grad_norm": 3.34375, + "learning_rate": 0.0032605940972427892, + "loss": 2.6277, + "mean_token_accuracy": 0.44794341921806335, + "num_tokens": 7888829385.0, + "step": 15433 + }, + { + "epoch": 4.173607355327204, + "grad_norm": 2.921875, + "learning_rate": 0.0032597895675910734, + "loss": 2.9172, + "mean_token_accuracy": 0.4346961975097656, + "num_tokens": 7889324512.0, + "step": 15434 + }, + { + "epoch": 4.173877771768524, + "grad_norm": 2.734375, + "learning_rate": 0.003258985275431777, + "loss": 2.6707, + "mean_token_accuracy": 0.45374345779418945, + "num_tokens": 7889848675.0, + "step": 15435 + }, + { + "epoch": 4.174148188209843, + "grad_norm": 3.015625, + "learning_rate": 0.0032581812207895743, + "loss": 2.6725, + "mean_token_accuracy": 0.4492228627204895, + "num_tokens": 7890372843.0, + "step": 15436 + }, + { + "epoch": 4.174418604651163, + "grad_norm": 3.171875, + "learning_rate": 0.0032573774036891417, + "loss": 2.8356, + "mean_token_accuracy": 0.445244163274765, + "num_tokens": 7890897056.0, + "step": 15437 + }, + { + "epoch": 4.174689021092482, + "grad_norm": 3.203125, + "learning_rate": 0.003256573824155139, + "loss": 2.7442, + "mean_token_accuracy": 0.43947499990463257, + "num_tokens": 7891421305.0, + "step": 15438 + }, + { + "epoch": 4.174959437533802, + "grad_norm": 2.6875, + "learning_rate": 0.0032557704822122214, + "loss": 2.8348, + "mean_token_accuracy": 0.4380708932876587, + "num_tokens": 7891944439.0, + "step": 15439 + }, + { + "epoch": 4.175229853975122, + "grad_norm": 3.125, + "learning_rate": 0.0032549673778850432, + "loss": 2.5804, + "mean_token_accuracy": 0.4505268335342407, + "num_tokens": 7892468619.0, + "step": 15440 + }, + { + "epoch": 4.175500270416442, + "grad_norm": 3.390625, + "learning_rate": 0.003254164511198241, + "loss": 2.5911, + "mean_token_accuracy": 0.45043593645095825, + "num_tokens": 7892992834.0, + "step": 15441 + }, + { + "epoch": 4.175770686857761, + "grad_norm": 3.875, + "learning_rate": 0.0032533618821764497, + "loss": 2.8837, + "mean_token_accuracy": 0.41422486305236816, + "num_tokens": 7893517054.0, + "step": 15442 + }, + { + "epoch": 4.176041103299081, + "grad_norm": 3.03125, + "learning_rate": 0.003252559490844298, + "loss": 2.6116, + "mean_token_accuracy": 0.46111881732940674, + "num_tokens": 7894041196.0, + "step": 15443 + }, + { + "epoch": 4.1763115197404, + "grad_norm": 3.5625, + "learning_rate": 0.0032517573372264036, + "loss": 2.7332, + "mean_token_accuracy": 0.44786274433135986, + "num_tokens": 7894565359.0, + "step": 15444 + }, + { + "epoch": 4.17658193618172, + "grad_norm": 3.234375, + "learning_rate": 0.0032509554213473826, + "loss": 2.8525, + "mean_token_accuracy": 0.4408913254737854, + "num_tokens": 7895075762.0, + "step": 15445 + }, + { + "epoch": 4.176852352623039, + "grad_norm": 3.109375, + "learning_rate": 0.003250153743231834, + "loss": 2.7475, + "mean_token_accuracy": 0.4882184863090515, + "num_tokens": 7895535742.0, + "step": 15446 + }, + { + "epoch": 4.1771227690643595, + "grad_norm": 4.0625, + "learning_rate": 0.0032493523029043627, + "loss": 2.6721, + "mean_token_accuracy": 0.4439224898815155, + "num_tokens": 7896059847.0, + "step": 15447 + }, + { + "epoch": 4.177393185505679, + "grad_norm": 3.75, + "learning_rate": 0.003248551100389554, + "loss": 2.6504, + "mean_token_accuracy": 0.44113689661026, + "num_tokens": 7896584079.0, + "step": 15448 + }, + { + "epoch": 4.177663601946998, + "grad_norm": 3.46875, + "learning_rate": 0.003247750135711995, + "loss": 2.8456, + "mean_token_accuracy": 0.44000911712646484, + "num_tokens": 7897108343.0, + "step": 15449 + }, + { + "epoch": 4.177934018388318, + "grad_norm": 4.75, + "learning_rate": 0.0032469494088962587, + "loss": 2.635, + "mean_token_accuracy": 0.4640252888202667, + "num_tokens": 7897579101.0, + "step": 15450 + }, + { + "epoch": 4.178204434829637, + "grad_norm": 13.75, + "learning_rate": 0.0032461489199669146, + "loss": 9.4688, + "mean_token_accuracy": 0.028776366263628006, + "num_tokens": 7898100035.0, + "step": 15451 + }, + { + "epoch": 4.178474851270957, + "grad_norm": 6.84375, + "learning_rate": 0.003245348668948525, + "loss": 2.8756, + "mean_token_accuracy": 0.4503883123397827, + "num_tokens": 7898624256.0, + "step": 15452 + }, + { + "epoch": 4.1787452677122765, + "grad_norm": 2.78125, + "learning_rate": 0.003244548655865641, + "loss": 2.6042, + "mean_token_accuracy": 0.45706337690353394, + "num_tokens": 7899139642.0, + "step": 15453 + }, + { + "epoch": 4.179015684153597, + "grad_norm": 2.96875, + "learning_rate": 0.003243748880742814, + "loss": 2.7364, + "mean_token_accuracy": 0.4382801651954651, + "num_tokens": 7899663842.0, + "step": 15454 + }, + { + "epoch": 4.179286100594916, + "grad_norm": 3.765625, + "learning_rate": 0.0032429493436045816, + "loss": 2.7095, + "mean_token_accuracy": 0.44087621569633484, + "num_tokens": 7900142080.0, + "step": 15455 + }, + { + "epoch": 4.179556517036236, + "grad_norm": 3.53125, + "learning_rate": 0.0032421500444754724, + "loss": 2.9631, + "mean_token_accuracy": 0.43060287833213806, + "num_tokens": 7900666198.0, + "step": 15456 + }, + { + "epoch": 4.179826933477555, + "grad_norm": 6.71875, + "learning_rate": 0.0032413509833800164, + "loss": 2.7454, + "mean_token_accuracy": 0.4665203094482422, + "num_tokens": 7901190249.0, + "step": 15457 + }, + { + "epoch": 4.180097349918875, + "grad_norm": 2.296875, + "learning_rate": 0.0032405521603427285, + "loss": 2.8311, + "mean_token_accuracy": 0.43574222922325134, + "num_tokens": 7901714423.0, + "step": 15458 + }, + { + "epoch": 4.180367766360194, + "grad_norm": 4.40625, + "learning_rate": 0.003239753575388118, + "loss": 2.8899, + "mean_token_accuracy": 0.4357849359512329, + "num_tokens": 7902227413.0, + "step": 15459 + }, + { + "epoch": 4.1806381828015144, + "grad_norm": 3.28125, + "learning_rate": 0.0032389552285406885, + "loss": 2.8286, + "mean_token_accuracy": 0.4532061517238617, + "num_tokens": 7902751670.0, + "step": 15460 + }, + { + "epoch": 4.180908599242834, + "grad_norm": 3.890625, + "learning_rate": 0.00323815711982494, + "loss": 2.7078, + "mean_token_accuracy": 0.4270347058773041, + "num_tokens": 7903275852.0, + "step": 15461 + }, + { + "epoch": 4.181179015684154, + "grad_norm": 3.0, + "learning_rate": 0.0032373592492653536, + "loss": 2.8547, + "mean_token_accuracy": 0.4235803186893463, + "num_tokens": 7903800000.0, + "step": 15462 + }, + { + "epoch": 4.181449432125473, + "grad_norm": 3.65625, + "learning_rate": 0.0032365616168864163, + "loss": 2.6605, + "mean_token_accuracy": 0.44261354207992554, + "num_tokens": 7904324272.0, + "step": 15463 + }, + { + "epoch": 4.181719848566793, + "grad_norm": 3.171875, + "learning_rate": 0.0032357642227125987, + "loss": 2.9146, + "mean_token_accuracy": 0.44449859857559204, + "num_tokens": 7904848396.0, + "step": 15464 + }, + { + "epoch": 4.181990265008112, + "grad_norm": 3.390625, + "learning_rate": 0.003234967066768368, + "loss": 2.5849, + "mean_token_accuracy": 0.45428696274757385, + "num_tokens": 7905372633.0, + "step": 15465 + }, + { + "epoch": 4.182260681449432, + "grad_norm": 2.890625, + "learning_rate": 0.0032341701490781827, + "loss": 2.7718, + "mean_token_accuracy": 0.44101202487945557, + "num_tokens": 7905874884.0, + "step": 15466 + }, + { + "epoch": 4.1825310978907515, + "grad_norm": 3.640625, + "learning_rate": 0.003233373469666497, + "loss": 2.8168, + "mean_token_accuracy": 0.43605172634124756, + "num_tokens": 7906398884.0, + "step": 15467 + }, + { + "epoch": 4.182801514332072, + "grad_norm": 3.40625, + "learning_rate": 0.0032325770285577503, + "loss": 2.7993, + "mean_token_accuracy": 0.4290136694908142, + "num_tokens": 7906923078.0, + "step": 15468 + }, + { + "epoch": 4.183071930773391, + "grad_norm": 3.71875, + "learning_rate": 0.003231780825776385, + "loss": 2.8744, + "mean_token_accuracy": 0.4345351755619049, + "num_tokens": 7907400858.0, + "step": 15469 + }, + { + "epoch": 4.183342347214711, + "grad_norm": 4.25, + "learning_rate": 0.0032309848613468286, + "loss": 2.764, + "mean_token_accuracy": 0.44221845269203186, + "num_tokens": 7907925128.0, + "step": 15470 + }, + { + "epoch": 4.18361276365603, + "grad_norm": 8.875, + "learning_rate": 0.0032301891352935026, + "loss": 8.2915, + "mean_token_accuracy": 0.04706336930394173, + "num_tokens": 7908449310.0, + "step": 15471 + }, + { + "epoch": 4.18388318009735, + "grad_norm": 25.625, + "learning_rate": 0.0032293936476408243, + "loss": 2.5019, + "mean_token_accuracy": 0.47872281074523926, + "num_tokens": 7908948528.0, + "step": 15472 + }, + { + "epoch": 4.184153596538669, + "grad_norm": 6.09375, + "learning_rate": 0.0032285983984131996, + "loss": 2.9567, + "mean_token_accuracy": 0.42065590620040894, + "num_tokens": 7909472769.0, + "step": 15473 + }, + { + "epoch": 4.1844240129799894, + "grad_norm": 2.875, + "learning_rate": 0.00322780338763503, + "loss": 2.7333, + "mean_token_accuracy": 0.43202272057533264, + "num_tokens": 7909996922.0, + "step": 15474 + }, + { + "epoch": 4.184694429421309, + "grad_norm": 3.484375, + "learning_rate": 0.003227008615330711, + "loss": 2.8157, + "mean_token_accuracy": 0.44287198781967163, + "num_tokens": 7910492860.0, + "step": 15475 + }, + { + "epoch": 4.184964845862629, + "grad_norm": 3.546875, + "learning_rate": 0.003226214081524625, + "loss": 2.985, + "mean_token_accuracy": 0.43393412232398987, + "num_tokens": 7910980423.0, + "step": 15476 + }, + { + "epoch": 4.185235262303948, + "grad_norm": 3.671875, + "learning_rate": 0.0032254197862411527, + "loss": 2.9898, + "mean_token_accuracy": 0.40769296884536743, + "num_tokens": 7911504671.0, + "step": 15477 + }, + { + "epoch": 4.185505678745268, + "grad_norm": 3.6875, + "learning_rate": 0.0032246257295046672, + "loss": 2.7872, + "mean_token_accuracy": 0.4493933320045471, + "num_tokens": 7912028774.0, + "step": 15478 + }, + { + "epoch": 4.185776095186587, + "grad_norm": 4.28125, + "learning_rate": 0.003223831911339527, + "loss": 2.6604, + "mean_token_accuracy": 0.4560641944408417, + "num_tokens": 7912493741.0, + "step": 15479 + }, + { + "epoch": 4.186046511627907, + "grad_norm": 2.75, + "learning_rate": 0.003223038331770094, + "loss": 2.8842, + "mean_token_accuracy": 0.44772869348526, + "num_tokens": 7912968019.0, + "step": 15480 + }, + { + "epoch": 4.1863169280692265, + "grad_norm": 4.71875, + "learning_rate": 0.0032222449908207154, + "loss": 2.8971, + "mean_token_accuracy": 0.4490881562232971, + "num_tokens": 7913421817.0, + "step": 15481 + }, + { + "epoch": 4.186587344510547, + "grad_norm": 3.640625, + "learning_rate": 0.0032214518885157312, + "loss": 2.7947, + "mean_token_accuracy": 0.4504355490207672, + "num_tokens": 7913919394.0, + "step": 15482 + }, + { + "epoch": 4.186857760951866, + "grad_norm": 3.328125, + "learning_rate": 0.003220659024879481, + "loss": 2.831, + "mean_token_accuracy": 0.43625056743621826, + "num_tokens": 7914443627.0, + "step": 15483 + }, + { + "epoch": 4.187128177393186, + "grad_norm": 4.09375, + "learning_rate": 0.0032198663999362894, + "loss": 2.9689, + "mean_token_accuracy": 0.44947129487991333, + "num_tokens": 7914904471.0, + "step": 15484 + }, + { + "epoch": 4.187398593834505, + "grad_norm": 4.3125, + "learning_rate": 0.003219074013710474, + "loss": 2.5582, + "mean_token_accuracy": 0.4399995803833008, + "num_tokens": 7915428585.0, + "step": 15485 + }, + { + "epoch": 4.187669010275825, + "grad_norm": 3.828125, + "learning_rate": 0.0032182818662263523, + "loss": 2.7869, + "mean_token_accuracy": 0.4311954975128174, + "num_tokens": 7915939885.0, + "step": 15486 + }, + { + "epoch": 4.187939426717144, + "grad_norm": 3.640625, + "learning_rate": 0.0032174899575082246, + "loss": 2.8293, + "mean_token_accuracy": 0.4458611309528351, + "num_tokens": 7916464026.0, + "step": 15487 + }, + { + "epoch": 4.1882098431584645, + "grad_norm": 4.34375, + "learning_rate": 0.0032166982875803953, + "loss": 2.9095, + "mean_token_accuracy": 0.44431567192077637, + "num_tokens": 7916943682.0, + "step": 15488 + }, + { + "epoch": 4.188480259599784, + "grad_norm": 3.984375, + "learning_rate": 0.003215906856467149, + "loss": 2.974, + "mean_token_accuracy": 0.42795902490615845, + "num_tokens": 7917467897.0, + "step": 15489 + }, + { + "epoch": 4.188750676041103, + "grad_norm": 4.53125, + "learning_rate": 0.003215115664192774, + "loss": 2.6753, + "mean_token_accuracy": 0.46335598826408386, + "num_tokens": 7917992101.0, + "step": 15490 + }, + { + "epoch": 4.189021092482423, + "grad_norm": 8.9375, + "learning_rate": 0.0032143247107815427, + "loss": 8.8985, + "mean_token_accuracy": 0.005866366438567638, + "num_tokens": 7918516369.0, + "step": 15491 + }, + { + "epoch": 4.189291508923742, + "grad_norm": 5.96875, + "learning_rate": 0.0032135339962577267, + "loss": 2.7737, + "mean_token_accuracy": 0.4483402371406555, + "num_tokens": 7919040397.0, + "step": 15492 + }, + { + "epoch": 4.189561925365062, + "grad_norm": 2.671875, + "learning_rate": 0.0032127435206455866, + "loss": 2.8307, + "mean_token_accuracy": 0.41299670934677124, + "num_tokens": 7919564557.0, + "step": 15493 + }, + { + "epoch": 4.189832341806381, + "grad_norm": 2.921875, + "learning_rate": 0.0032119532839693737, + "loss": 2.8935, + "mean_token_accuracy": 0.4518604874610901, + "num_tokens": 7920088585.0, + "step": 15494 + }, + { + "epoch": 4.1901027582477015, + "grad_norm": 3.796875, + "learning_rate": 0.0032111632862533396, + "loss": 2.7398, + "mean_token_accuracy": 0.44223853945732117, + "num_tokens": 7920553174.0, + "step": 15495 + }, + { + "epoch": 4.190373174689021, + "grad_norm": 3.046875, + "learning_rate": 0.0032103735275217215, + "loss": 2.9187, + "mean_token_accuracy": 0.42523300647735596, + "num_tokens": 7921077394.0, + "step": 15496 + }, + { + "epoch": 4.190643591130341, + "grad_norm": 3.234375, + "learning_rate": 0.0032095840077987496, + "loss": 2.6992, + "mean_token_accuracy": 0.4430507719516754, + "num_tokens": 7921601533.0, + "step": 15497 + }, + { + "epoch": 4.19091400757166, + "grad_norm": 3.234375, + "learning_rate": 0.003208794727108652, + "loss": 2.6487, + "mean_token_accuracy": 0.4538821578025818, + "num_tokens": 7922125723.0, + "step": 15498 + }, + { + "epoch": 4.19118442401298, + "grad_norm": 3.265625, + "learning_rate": 0.0032080056854756427, + "loss": 2.8602, + "mean_token_accuracy": 0.4440404772758484, + "num_tokens": 7922649990.0, + "step": 15499 + }, + { + "epoch": 4.191454840454299, + "grad_norm": 4.125, + "learning_rate": 0.003207216882923935, + "loss": 2.9217, + "mean_token_accuracy": 0.43823665380477905, + "num_tokens": 7923174169.0, + "step": 15500 + }, + { + "epoch": 4.191725256895619, + "grad_norm": 3.25, + "learning_rate": 0.0032064283194777315, + "loss": 2.8023, + "mean_token_accuracy": 0.43949276208877563, + "num_tokens": 7923698443.0, + "step": 15501 + }, + { + "epoch": 4.191995673336939, + "grad_norm": 4.28125, + "learning_rate": 0.003205639995161225, + "loss": 2.9601, + "mean_token_accuracy": 0.43504875898361206, + "num_tokens": 7924183912.0, + "step": 15502 + }, + { + "epoch": 4.192266089778259, + "grad_norm": 3.4375, + "learning_rate": 0.003204851909998604, + "loss": 2.864, + "mean_token_accuracy": 0.4411906599998474, + "num_tokens": 7924708189.0, + "step": 15503 + }, + { + "epoch": 4.192536506219578, + "grad_norm": 3.890625, + "learning_rate": 0.0032040640640140527, + "loss": 2.7092, + "mean_token_accuracy": 0.44588175415992737, + "num_tokens": 7925214235.0, + "step": 15504 + }, + { + "epoch": 4.192806922660898, + "grad_norm": 3.21875, + "learning_rate": 0.0032032764572317396, + "loss": 2.8457, + "mean_token_accuracy": 0.43669790029525757, + "num_tokens": 7925738329.0, + "step": 15505 + }, + { + "epoch": 4.193077339102217, + "grad_norm": 4.03125, + "learning_rate": 0.0032024890896758373, + "loss": 2.7178, + "mean_token_accuracy": 0.44232118129730225, + "num_tokens": 7926262597.0, + "step": 15506 + }, + { + "epoch": 4.193347755543537, + "grad_norm": 3.75, + "learning_rate": 0.0032017019613704988, + "loss": 2.7158, + "mean_token_accuracy": 0.45002177357673645, + "num_tokens": 7926786774.0, + "step": 15507 + }, + { + "epoch": 4.193618171984856, + "grad_norm": 3.46875, + "learning_rate": 0.003200915072339877, + "loss": 2.9618, + "mean_token_accuracy": 0.4102964401245117, + "num_tokens": 7927269551.0, + "step": 15508 + }, + { + "epoch": 4.1938885884261765, + "grad_norm": 3.46875, + "learning_rate": 0.0032001284226081178, + "loss": 2.8165, + "mean_token_accuracy": 0.436343789100647, + "num_tokens": 7927793830.0, + "step": 15509 + }, + { + "epoch": 4.194159004867496, + "grad_norm": 6.9375, + "learning_rate": 0.0031993420121993557, + "loss": 2.5336, + "mean_token_accuracy": 0.48850977420806885, + "num_tokens": 7928276208.0, + "step": 15510 + }, + { + "epoch": 4.194429421308816, + "grad_norm": 11.5625, + "learning_rate": 0.0031985558411377203, + "loss": 8.3411, + "mean_token_accuracy": 0.0361662432551384, + "num_tokens": 7928800493.0, + "step": 15511 + }, + { + "epoch": 4.194699837750135, + "grad_norm": 4.59375, + "learning_rate": 0.003197769909447335, + "loss": 2.7786, + "mean_token_accuracy": 0.43404608964920044, + "num_tokens": 7929324664.0, + "step": 15512 + }, + { + "epoch": 4.194970254191455, + "grad_norm": 3.140625, + "learning_rate": 0.0031969842171523143, + "loss": 2.7352, + "mean_token_accuracy": 0.43110525608062744, + "num_tokens": 7929834817.0, + "step": 15513 + }, + { + "epoch": 4.195240670632774, + "grad_norm": 2.90625, + "learning_rate": 0.0031961987642767633, + "loss": 2.6772, + "mean_token_accuracy": 0.46071258187294006, + "num_tokens": 7930359026.0, + "step": 15514 + }, + { + "epoch": 4.195511087074094, + "grad_norm": 2.859375, + "learning_rate": 0.003195413550844785, + "loss": 2.809, + "mean_token_accuracy": 0.4478727877140045, + "num_tokens": 7930883289.0, + "step": 15515 + }, + { + "epoch": 4.195781503515414, + "grad_norm": 5.46875, + "learning_rate": 0.0031946285768804693, + "loss": 2.8507, + "mean_token_accuracy": 0.436353474855423, + "num_tokens": 7931407562.0, + "step": 15516 + }, + { + "epoch": 4.196051919956734, + "grad_norm": 3.390625, + "learning_rate": 0.0031938438424079026, + "loss": 2.7923, + "mean_token_accuracy": 0.4411708116531372, + "num_tokens": 7931931745.0, + "step": 15517 + }, + { + "epoch": 4.196322336398053, + "grad_norm": 4.09375, + "learning_rate": 0.0031930593474511653, + "loss": 2.9323, + "mean_token_accuracy": 0.4387643337249756, + "num_tokens": 7932455968.0, + "step": 15518 + }, + { + "epoch": 4.196592752839373, + "grad_norm": 3.640625, + "learning_rate": 0.0031922750920343264, + "loss": 2.7315, + "mean_token_accuracy": 0.45453834533691406, + "num_tokens": 7932937902.0, + "step": 15519 + }, + { + "epoch": 4.196863169280692, + "grad_norm": 41.75, + "learning_rate": 0.0031914910761814464, + "loss": 2.8217, + "mean_token_accuracy": 0.4258403182029724, + "num_tokens": 7933434729.0, + "step": 15520 + }, + { + "epoch": 4.197133585722012, + "grad_norm": 5.25, + "learning_rate": 0.0031907072999165856, + "loss": 2.7041, + "mean_token_accuracy": 0.45083093643188477, + "num_tokens": 7933958946.0, + "step": 15521 + }, + { + "epoch": 4.197404002163331, + "grad_norm": 2.75, + "learning_rate": 0.003189923763263787, + "loss": 2.9112, + "mean_token_accuracy": 0.41147178411483765, + "num_tokens": 7934483099.0, + "step": 15522 + }, + { + "epoch": 4.1976744186046515, + "grad_norm": 3.03125, + "learning_rate": 0.0031891404662470996, + "loss": 2.7542, + "mean_token_accuracy": 0.42979303002357483, + "num_tokens": 7935007367.0, + "step": 15523 + }, + { + "epoch": 4.197944835045971, + "grad_norm": 30.0, + "learning_rate": 0.0031883574088905514, + "loss": 2.9487, + "mean_token_accuracy": 0.4095957279205322, + "num_tokens": 7935531641.0, + "step": 15524 + }, + { + "epoch": 4.198215251487291, + "grad_norm": 4.90625, + "learning_rate": 0.0031875745912181693, + "loss": 2.7827, + "mean_token_accuracy": 0.4359995424747467, + "num_tokens": 7936055796.0, + "step": 15525 + }, + { + "epoch": 4.19848566792861, + "grad_norm": 2.796875, + "learning_rate": 0.003186792013253975, + "loss": 2.9821, + "mean_token_accuracy": 0.4351021349430084, + "num_tokens": 7936528126.0, + "step": 15526 + }, + { + "epoch": 4.19875608436993, + "grad_norm": 3.46875, + "learning_rate": 0.0031860096750219798, + "loss": 2.7922, + "mean_token_accuracy": 0.4398044943809509, + "num_tokens": 7937052401.0, + "step": 15527 + }, + { + "epoch": 4.199026500811249, + "grad_norm": 3.6875, + "learning_rate": 0.0031852275765461846, + "loss": 2.7183, + "mean_token_accuracy": 0.44995784759521484, + "num_tokens": 7937540454.0, + "step": 15528 + }, + { + "epoch": 4.199296917252569, + "grad_norm": 3.625, + "learning_rate": 0.0031844457178505908, + "loss": 2.7665, + "mean_token_accuracy": 0.44693151116371155, + "num_tokens": 7938010664.0, + "step": 15529 + }, + { + "epoch": 4.199567333693889, + "grad_norm": 4.125, + "learning_rate": 0.0031836640989591847, + "loss": 2.5505, + "mean_token_accuracy": 0.5035786628723145, + "num_tokens": 7938481841.0, + "step": 15530 + }, + { + "epoch": 4.199837750135208, + "grad_norm": 46.0, + "learning_rate": 0.003182882719895952, + "loss": 8.9168, + "mean_token_accuracy": 0.0019041431369259953, + "num_tokens": 7939006073.0, + "step": 15531 + }, + { + "epoch": 4.200108166576528, + "grad_norm": 6.53125, + "learning_rate": 0.0031821015806848645, + "loss": 2.9593, + "mean_token_accuracy": 0.4110071659088135, + "num_tokens": 7939530320.0, + "step": 15532 + }, + { + "epoch": 4.200378583017847, + "grad_norm": 3.359375, + "learning_rate": 0.003181320681349894, + "loss": 2.9048, + "mean_token_accuracy": 0.4118019640445709, + "num_tokens": 7940054541.0, + "step": 15533 + }, + { + "epoch": 4.200648999459167, + "grad_norm": 3.53125, + "learning_rate": 0.003180540021914995, + "loss": 2.8099, + "mean_token_accuracy": 0.4285121560096741, + "num_tokens": 7940578720.0, + "step": 15534 + }, + { + "epoch": 4.200919415900486, + "grad_norm": 3.5625, + "learning_rate": 0.003179759602404127, + "loss": 2.8121, + "mean_token_accuracy": 0.43993327021598816, + "num_tokens": 7941102990.0, + "step": 15535 + }, + { + "epoch": 4.201189832341806, + "grad_norm": 3.5, + "learning_rate": 0.003178979422841232, + "loss": 2.8751, + "mean_token_accuracy": 0.42610612511634827, + "num_tokens": 7941627134.0, + "step": 15536 + }, + { + "epoch": 4.201460248783126, + "grad_norm": 3.65625, + "learning_rate": 0.003178199483250245, + "loss": 2.9328, + "mean_token_accuracy": 0.42389756441116333, + "num_tokens": 7942151336.0, + "step": 15537 + }, + { + "epoch": 4.201730665224446, + "grad_norm": 4.0625, + "learning_rate": 0.003177419783655104, + "loss": 2.8057, + "mean_token_accuracy": 0.43128642439842224, + "num_tokens": 7942675477.0, + "step": 15538 + }, + { + "epoch": 4.202001081665765, + "grad_norm": 2.828125, + "learning_rate": 0.003176640324079728, + "loss": 2.6568, + "mean_token_accuracy": 0.4427608847618103, + "num_tokens": 7943199666.0, + "step": 15539 + }, + { + "epoch": 4.202271498107085, + "grad_norm": 3.4375, + "learning_rate": 0.003175861104548032, + "loss": 2.7318, + "mean_token_accuracy": 0.4567519426345825, + "num_tokens": 7943723870.0, + "step": 15540 + }, + { + "epoch": 4.202541914548404, + "grad_norm": 3.65625, + "learning_rate": 0.0031750821250839272, + "loss": 2.7407, + "mean_token_accuracy": 0.42566943168640137, + "num_tokens": 7944248055.0, + "step": 15541 + }, + { + "epoch": 4.202812330989724, + "grad_norm": 3.640625, + "learning_rate": 0.0031743033857113156, + "loss": 2.8377, + "mean_token_accuracy": 0.41669365763664246, + "num_tokens": 7944772205.0, + "step": 15542 + }, + { + "epoch": 4.2030827474310435, + "grad_norm": 3.328125, + "learning_rate": 0.003173524886454089, + "loss": 2.7868, + "mean_token_accuracy": 0.4399387836456299, + "num_tokens": 7945296413.0, + "step": 15543 + }, + { + "epoch": 4.203353163872364, + "grad_norm": 27.125, + "learning_rate": 0.0031727466273361337, + "loss": 2.6282, + "mean_token_accuracy": 0.46011781692504883, + "num_tokens": 7945769026.0, + "step": 15544 + }, + { + "epoch": 4.203623580313683, + "grad_norm": 6.8125, + "learning_rate": 0.0031719686083813326, + "loss": 3.0318, + "mean_token_accuracy": 0.4149894118309021, + "num_tokens": 7946293268.0, + "step": 15545 + }, + { + "epoch": 4.203893996755003, + "grad_norm": 3.09375, + "learning_rate": 0.003171190829613554, + "loss": 2.8285, + "mean_token_accuracy": 0.4500330090522766, + "num_tokens": 7946723890.0, + "step": 15546 + }, + { + "epoch": 4.204164413196322, + "grad_norm": 3.96875, + "learning_rate": 0.0031704132910566653, + "loss": 2.7539, + "mean_token_accuracy": 0.46566593647003174, + "num_tokens": 7947248060.0, + "step": 15547 + }, + { + "epoch": 4.204434829637642, + "grad_norm": 3.5625, + "learning_rate": 0.0031696359927345197, + "loss": 2.9598, + "mean_token_accuracy": 0.4317690134048462, + "num_tokens": 7947724042.0, + "step": 15548 + }, + { + "epoch": 4.204705246078961, + "grad_norm": 4.09375, + "learning_rate": 0.003168858934670972, + "loss": 2.8469, + "mean_token_accuracy": 0.46151095628738403, + "num_tokens": 7948110369.0, + "step": 15549 + }, + { + "epoch": 4.204975662520281, + "grad_norm": 3.046875, + "learning_rate": 0.003168082116889862, + "loss": 2.7427, + "mean_token_accuracy": 0.4613233208656311, + "num_tokens": 7948634486.0, + "step": 15550 + }, + { + "epoch": 4.205246078961601, + "grad_norm": 7.0, + "learning_rate": 0.0031673055394150222, + "loss": 8.3799, + "mean_token_accuracy": 0.039579153060913086, + "num_tokens": 7949143243.0, + "step": 15551 + }, + { + "epoch": 4.205516495402921, + "grad_norm": 6.375, + "learning_rate": 0.003166529202270285, + "loss": 2.7652, + "mean_token_accuracy": 0.4354155659675598, + "num_tokens": 7949667510.0, + "step": 15552 + }, + { + "epoch": 4.20578691184424, + "grad_norm": 2.328125, + "learning_rate": 0.0031657531054794697, + "loss": 2.8935, + "mean_token_accuracy": 0.43194687366485596, + "num_tokens": 7950191569.0, + "step": 15553 + }, + { + "epoch": 4.20605732828556, + "grad_norm": 3.1875, + "learning_rate": 0.0031649772490663854, + "loss": 2.7207, + "mean_token_accuracy": 0.44881653785705566, + "num_tokens": 7950715847.0, + "step": 15554 + }, + { + "epoch": 4.206327744726879, + "grad_norm": 3.0, + "learning_rate": 0.003164201633054843, + "loss": 2.8451, + "mean_token_accuracy": 0.45264720916748047, + "num_tokens": 7951126847.0, + "step": 15555 + }, + { + "epoch": 4.206598161168199, + "grad_norm": 3.140625, + "learning_rate": 0.003163426257468637, + "loss": 2.7696, + "mean_token_accuracy": 0.4511146545410156, + "num_tokens": 7951650897.0, + "step": 15556 + }, + { + "epoch": 4.2068685776095185, + "grad_norm": 3.484375, + "learning_rate": 0.003162651122331558, + "loss": 2.6925, + "mean_token_accuracy": 0.44097644090652466, + "num_tokens": 7952175132.0, + "step": 15557 + }, + { + "epoch": 4.207138994050839, + "grad_norm": 2.734375, + "learning_rate": 0.003161876227667393, + "loss": 2.7074, + "mean_token_accuracy": 0.471168577671051, + "num_tokens": 7952645763.0, + "step": 15558 + }, + { + "epoch": 4.207409410492158, + "grad_norm": 3.40625, + "learning_rate": 0.003161101573499912, + "loss": 2.6415, + "mean_token_accuracy": 0.4538237452507019, + "num_tokens": 7953169994.0, + "step": 15559 + }, + { + "epoch": 4.207679826933478, + "grad_norm": 3.0, + "learning_rate": 0.0031603271598528893, + "loss": 2.9543, + "mean_token_accuracy": 0.434789776802063, + "num_tokens": 7953694173.0, + "step": 15560 + }, + { + "epoch": 4.207950243374797, + "grad_norm": 4.125, + "learning_rate": 0.003159552986750086, + "loss": 2.8254, + "mean_token_accuracy": 0.43142253160476685, + "num_tokens": 7954218448.0, + "step": 15561 + }, + { + "epoch": 4.208220659816117, + "grad_norm": 3.40625, + "learning_rate": 0.003158779054215254, + "loss": 2.8129, + "mean_token_accuracy": 0.4723934531211853, + "num_tokens": 7954662403.0, + "step": 15562 + }, + { + "epoch": 4.208491076257436, + "grad_norm": 4.34375, + "learning_rate": 0.003158005362272138, + "loss": 2.8583, + "mean_token_accuracy": 0.43845221400260925, + "num_tokens": 7955186533.0, + "step": 15563 + }, + { + "epoch": 4.208761492698756, + "grad_norm": 3.328125, + "learning_rate": 0.0031572319109444812, + "loss": 2.8669, + "mean_token_accuracy": 0.4794599413871765, + "num_tokens": 7955604278.0, + "step": 15564 + }, + { + "epoch": 4.209031909140076, + "grad_norm": 33.0, + "learning_rate": 0.003156458700256014, + "loss": 2.791, + "mean_token_accuracy": 0.478185772895813, + "num_tokens": 7956088450.0, + "step": 15565 + }, + { + "epoch": 4.209302325581396, + "grad_norm": 6.3125, + "learning_rate": 0.0031556857302304587, + "loss": 2.9817, + "mean_token_accuracy": 0.4280049204826355, + "num_tokens": 7956596150.0, + "step": 15566 + }, + { + "epoch": 4.209572742022715, + "grad_norm": 3.09375, + "learning_rate": 0.0031549130008915343, + "loss": 2.8583, + "mean_token_accuracy": 0.43193912506103516, + "num_tokens": 7957120261.0, + "step": 15567 + }, + { + "epoch": 4.209843158464035, + "grad_norm": 4.1875, + "learning_rate": 0.0031541405122629497, + "loss": 2.7651, + "mean_token_accuracy": 0.44654470682144165, + "num_tokens": 7957638743.0, + "step": 15568 + }, + { + "epoch": 4.210113574905354, + "grad_norm": 4.625, + "learning_rate": 0.003153368264368408, + "loss": 2.8646, + "mean_token_accuracy": 0.41310444474220276, + "num_tokens": 7958162739.0, + "step": 15569 + }, + { + "epoch": 4.210383991346674, + "grad_norm": 3.21875, + "learning_rate": 0.003152596257231603, + "loss": 2.6849, + "mean_token_accuracy": 0.44486182928085327, + "num_tokens": 7958686929.0, + "step": 15570 + }, + { + "epoch": 4.2106544077879935, + "grad_norm": 190.0, + "learning_rate": 0.0031518244908762217, + "loss": 10.1281, + "mean_token_accuracy": 0.006683455780148506, + "num_tokens": 7959211094.0, + "step": 15571 + }, + { + "epoch": 4.210924824229313, + "grad_norm": 7.65625, + "learning_rate": 0.003151052965325947, + "loss": 2.9164, + "mean_token_accuracy": 0.41683700680732727, + "num_tokens": 7959735106.0, + "step": 15572 + }, + { + "epoch": 4.211195240670633, + "grad_norm": 2.078125, + "learning_rate": 0.0031502816806044465, + "loss": 2.933, + "mean_token_accuracy": 0.42454713582992554, + "num_tokens": 7960259322.0, + "step": 15573 + }, + { + "epoch": 4.211465657111952, + "grad_norm": 2.703125, + "learning_rate": 0.0031495106367353913, + "loss": 2.7954, + "mean_token_accuracy": 0.450389564037323, + "num_tokens": 7960783363.0, + "step": 15574 + }, + { + "epoch": 4.211736073553272, + "grad_norm": 3.09375, + "learning_rate": 0.003148739833742435, + "loss": 2.6335, + "mean_token_accuracy": 0.45462560653686523, + "num_tokens": 7961307558.0, + "step": 15575 + }, + { + "epoch": 4.212006489994591, + "grad_norm": 2.984375, + "learning_rate": 0.0031479692716492313, + "loss": 2.7503, + "mean_token_accuracy": 0.4474453330039978, + "num_tokens": 7961798866.0, + "step": 15576 + }, + { + "epoch": 4.212276906435911, + "grad_norm": 3.578125, + "learning_rate": 0.0031471989504794195, + "loss": 2.7449, + "mean_token_accuracy": 0.4429421126842499, + "num_tokens": 7962323112.0, + "step": 15577 + }, + { + "epoch": 4.2125473228772305, + "grad_norm": 3.078125, + "learning_rate": 0.003146428870256639, + "loss": 2.8489, + "mean_token_accuracy": 0.4355979561805725, + "num_tokens": 7962847392.0, + "step": 15578 + }, + { + "epoch": 4.212817739318551, + "grad_norm": 3.234375, + "learning_rate": 0.003145659031004517, + "loss": 2.8331, + "mean_token_accuracy": 0.44151735305786133, + "num_tokens": 7963363212.0, + "step": 15579 + }, + { + "epoch": 4.21308815575987, + "grad_norm": 3.28125, + "learning_rate": 0.003144889432746673, + "loss": 2.9201, + "mean_token_accuracy": 0.42879700660705566, + "num_tokens": 7963887338.0, + "step": 15580 + }, + { + "epoch": 4.21335857220119, + "grad_norm": 3.390625, + "learning_rate": 0.0031441200755067213, + "loss": 2.7247, + "mean_token_accuracy": 0.44767123460769653, + "num_tokens": 7964411540.0, + "step": 15581 + }, + { + "epoch": 4.213628988642509, + "grad_norm": 3.0, + "learning_rate": 0.0031433509593082705, + "loss": 2.7719, + "mean_token_accuracy": 0.45346593856811523, + "num_tokens": 7964892303.0, + "step": 15582 + }, + { + "epoch": 4.213899405083829, + "grad_norm": 4.15625, + "learning_rate": 0.003142582084174914, + "loss": 2.7348, + "mean_token_accuracy": 0.44674989581108093, + "num_tokens": 7965416530.0, + "step": 15583 + }, + { + "epoch": 4.214169821525148, + "grad_norm": 3.6875, + "learning_rate": 0.003141813450130249, + "loss": 2.8268, + "mean_token_accuracy": 0.41826945543289185, + "num_tokens": 7965940790.0, + "step": 15584 + }, + { + "epoch": 4.2144402379664685, + "grad_norm": 3.375, + "learning_rate": 0.0031410450571978566, + "loss": 2.6873, + "mean_token_accuracy": 0.45079007744789124, + "num_tokens": 7966371628.0, + "step": 15585 + }, + { + "epoch": 4.214710654407788, + "grad_norm": 2.921875, + "learning_rate": 0.0031402769054013112, + "loss": 2.7904, + "mean_token_accuracy": 0.45141667127609253, + "num_tokens": 7966895754.0, + "step": 15586 + }, + { + "epoch": 4.214981070849108, + "grad_norm": 3.515625, + "learning_rate": 0.0031395089947641847, + "loss": 2.7956, + "mean_token_accuracy": 0.44545847177505493, + "num_tokens": 7967420027.0, + "step": 15587 + }, + { + "epoch": 4.215251487290427, + "grad_norm": 4.03125, + "learning_rate": 0.0031387413253100398, + "loss": 2.9878, + "mean_token_accuracy": 0.4264875650405884, + "num_tokens": 7967944257.0, + "step": 15588 + }, + { + "epoch": 4.215521903731747, + "grad_norm": 3.671875, + "learning_rate": 0.0031379738970624283, + "loss": 2.7445, + "mean_token_accuracy": 0.42947232723236084, + "num_tokens": 7968468446.0, + "step": 15589 + }, + { + "epoch": 4.215792320173066, + "grad_norm": 2.90625, + "learning_rate": 0.0031372067100448996, + "loss": 2.9213, + "mean_token_accuracy": 0.43262773752212524, + "num_tokens": 7968992599.0, + "step": 15590 + }, + { + "epoch": 4.216062736614386, + "grad_norm": 11.8125, + "learning_rate": 0.003136439764280991, + "loss": 8.6407, + "mean_token_accuracy": 0.013624262064695358, + "num_tokens": 7969516796.0, + "step": 15591 + }, + { + "epoch": 4.2163331530557056, + "grad_norm": 7.71875, + "learning_rate": 0.0031356730597942375, + "loss": 2.8254, + "mean_token_accuracy": 0.44516706466674805, + "num_tokens": 7970041035.0, + "step": 15592 + }, + { + "epoch": 4.216603569497026, + "grad_norm": 2.828125, + "learning_rate": 0.0031349065966081607, + "loss": 2.93, + "mean_token_accuracy": 0.42173391580581665, + "num_tokens": 7970565218.0, + "step": 15593 + }, + { + "epoch": 4.216873985938345, + "grad_norm": 3.578125, + "learning_rate": 0.0031341403747462784, + "loss": 2.8836, + "mean_token_accuracy": 0.42556560039520264, + "num_tokens": 7971089455.0, + "step": 15594 + }, + { + "epoch": 4.217144402379665, + "grad_norm": 3.234375, + "learning_rate": 0.0031333743942321035, + "loss": 2.9437, + "mean_token_accuracy": 0.42502379417419434, + "num_tokens": 7971613652.0, + "step": 15595 + }, + { + "epoch": 4.217414818820984, + "grad_norm": 4.53125, + "learning_rate": 0.003132608655089135, + "loss": 2.7331, + "mean_token_accuracy": 0.4537905752658844, + "num_tokens": 7972137913.0, + "step": 15596 + }, + { + "epoch": 4.217685235262304, + "grad_norm": 3.4375, + "learning_rate": 0.003131843157340869, + "loss": 2.7868, + "mean_token_accuracy": 0.45034512877464294, + "num_tokens": 7972599406.0, + "step": 15597 + }, + { + "epoch": 4.217955651703623, + "grad_norm": 4.28125, + "learning_rate": 0.003131077901010795, + "loss": 2.6651, + "mean_token_accuracy": 0.4336707890033722, + "num_tokens": 7973123643.0, + "step": 15598 + }, + { + "epoch": 4.2182260681449435, + "grad_norm": 3.21875, + "learning_rate": 0.003130312886122392, + "loss": 2.8924, + "mean_token_accuracy": 0.4277459979057312, + "num_tokens": 7973647915.0, + "step": 15599 + }, + { + "epoch": 4.218496484586263, + "grad_norm": 3.5625, + "learning_rate": 0.0031295481126991315, + "loss": 2.8817, + "mean_token_accuracy": 0.4411452114582062, + "num_tokens": 7974172069.0, + "step": 15600 + }, + { + "epoch": 4.218766901027583, + "grad_norm": 24.125, + "learning_rate": 0.003128783580764481, + "loss": 2.6011, + "mean_token_accuracy": 0.4480125904083252, + "num_tokens": 7974696250.0, + "step": 15601 + }, + { + "epoch": 4.219037317468902, + "grad_norm": 5.875, + "learning_rate": 0.003128019290341898, + "loss": 2.7417, + "mean_token_accuracy": 0.44326722621917725, + "num_tokens": 7975212513.0, + "step": 15602 + }, + { + "epoch": 4.219307733910222, + "grad_norm": 3.140625, + "learning_rate": 0.003127255241454831, + "loss": 2.6678, + "mean_token_accuracy": 0.453571617603302, + "num_tokens": 7975736783.0, + "step": 15603 + }, + { + "epoch": 4.219578150351541, + "grad_norm": 4.15625, + "learning_rate": 0.003126491434126728, + "loss": 2.7537, + "mean_token_accuracy": 0.4316551387310028, + "num_tokens": 7976260991.0, + "step": 15604 + }, + { + "epoch": 4.219848566792861, + "grad_norm": 4.0625, + "learning_rate": 0.003125727868381024, + "loss": 2.8796, + "mean_token_accuracy": 0.4246861934661865, + "num_tokens": 7976785196.0, + "step": 15605 + }, + { + "epoch": 4.2201189832341806, + "grad_norm": 3.8125, + "learning_rate": 0.003124964544241142, + "loss": 2.9299, + "mean_token_accuracy": 0.41418522596359253, + "num_tokens": 7977309319.0, + "step": 15606 + }, + { + "epoch": 4.220389399675501, + "grad_norm": 3.703125, + "learning_rate": 0.00312420146173051, + "loss": 2.7592, + "mean_token_accuracy": 0.4279920756816864, + "num_tokens": 7977833442.0, + "step": 15607 + }, + { + "epoch": 4.22065981611682, + "grad_norm": 2.546875, + "learning_rate": 0.0031234386208725366, + "loss": 2.9053, + "mean_token_accuracy": 0.43734127283096313, + "num_tokens": 7978357727.0, + "step": 15608 + }, + { + "epoch": 4.22093023255814, + "grad_norm": 3.40625, + "learning_rate": 0.0031226760216906293, + "loss": 2.8284, + "mean_token_accuracy": 0.41646647453308105, + "num_tokens": 7978860770.0, + "step": 15609 + }, + { + "epoch": 4.221200648999459, + "grad_norm": 3.125, + "learning_rate": 0.003121913664208189, + "loss": 2.8713, + "mean_token_accuracy": 0.44526636600494385, + "num_tokens": 7979332393.0, + "step": 15610 + }, + { + "epoch": 4.221471065440779, + "grad_norm": 65.5, + "learning_rate": 0.003121151548448604, + "loss": 8.4415, + "mean_token_accuracy": 0.06035866588354111, + "num_tokens": 7979856655.0, + "step": 15611 + }, + { + "epoch": 4.221741481882098, + "grad_norm": 6.78125, + "learning_rate": 0.00312038967443526, + "loss": 2.8747, + "mean_token_accuracy": 0.44046756625175476, + "num_tokens": 7980347681.0, + "step": 15612 + }, + { + "epoch": 4.222011898323418, + "grad_norm": 5.59375, + "learning_rate": 0.0031196280421915334, + "loss": 2.873, + "mean_token_accuracy": 0.43723854422569275, + "num_tokens": 7980871929.0, + "step": 15613 + }, + { + "epoch": 4.222282314764738, + "grad_norm": 3.5625, + "learning_rate": 0.003118866651740792, + "loss": 2.7712, + "mean_token_accuracy": 0.43432295322418213, + "num_tokens": 7981337469.0, + "step": 15614 + }, + { + "epoch": 4.222552731206058, + "grad_norm": 3.21875, + "learning_rate": 0.0031181055031064, + "loss": 2.7324, + "mean_token_accuracy": 0.44440194964408875, + "num_tokens": 7981861721.0, + "step": 15615 + }, + { + "epoch": 4.222823147647377, + "grad_norm": 3.953125, + "learning_rate": 0.0031173445963117094, + "loss": 2.8233, + "mean_token_accuracy": 0.43926048278808594, + "num_tokens": 7982385824.0, + "step": 15616 + }, + { + "epoch": 4.223093564088696, + "grad_norm": 4.15625, + "learning_rate": 0.0031165839313800677, + "loss": 2.7112, + "mean_token_accuracy": 0.459882915019989, + "num_tokens": 7982892116.0, + "step": 15617 + }, + { + "epoch": 4.223363980530016, + "grad_norm": 3.21875, + "learning_rate": 0.0031158235083348167, + "loss": 2.4921, + "mean_token_accuracy": 0.46856003999710083, + "num_tokens": 7983386756.0, + "step": 15618 + }, + { + "epoch": 4.2236343969713355, + "grad_norm": 3.484375, + "learning_rate": 0.003115063327199285, + "loss": 2.9039, + "mean_token_accuracy": 0.4397394359111786, + "num_tokens": 7983911028.0, + "step": 15619 + }, + { + "epoch": 4.223904813412656, + "grad_norm": 3.578125, + "learning_rate": 0.003114303387996798, + "loss": 2.8584, + "mean_token_accuracy": 0.44023311138153076, + "num_tokens": 7984435238.0, + "step": 15620 + }, + { + "epoch": 4.224175229853975, + "grad_norm": 3.890625, + "learning_rate": 0.0031135436907506744, + "loss": 2.5832, + "mean_token_accuracy": 0.44592976570129395, + "num_tokens": 7984959516.0, + "step": 15621 + }, + { + "epoch": 4.224445646295295, + "grad_norm": 3.1875, + "learning_rate": 0.0031127842354842232, + "loss": 2.5934, + "mean_token_accuracy": 0.45552870631217957, + "num_tokens": 7985448342.0, + "step": 15622 + }, + { + "epoch": 4.224716062736614, + "grad_norm": 3.171875, + "learning_rate": 0.003112025022220745, + "loss": 2.8881, + "mean_token_accuracy": 0.4346392750740051, + "num_tokens": 7985972609.0, + "step": 15623 + }, + { + "epoch": 4.224986479177934, + "grad_norm": 3.765625, + "learning_rate": 0.0031112660509835374, + "loss": 2.8354, + "mean_token_accuracy": 0.4406174421310425, + "num_tokens": 7986496775.0, + "step": 15624 + }, + { + "epoch": 4.225256895619253, + "grad_norm": 3.421875, + "learning_rate": 0.003110507321795887, + "loss": 2.8431, + "mean_token_accuracy": 0.4416149854660034, + "num_tokens": 7986961766.0, + "step": 15625 + }, + { + "epoch": 4.225527312060573, + "grad_norm": 3.546875, + "learning_rate": 0.003109748834681071, + "loss": 2.8393, + "mean_token_accuracy": 0.4354640245437622, + "num_tokens": 7987456688.0, + "step": 15626 + }, + { + "epoch": 4.225797728501893, + "grad_norm": 4.09375, + "learning_rate": 0.003108990589662367, + "loss": 2.5876, + "mean_token_accuracy": 0.4537563920021057, + "num_tokens": 7987980817.0, + "step": 15627 + }, + { + "epoch": 4.226068144943213, + "grad_norm": 2.859375, + "learning_rate": 0.0031082325867630377, + "loss": 2.7149, + "mean_token_accuracy": 0.44232815504074097, + "num_tokens": 7988505094.0, + "step": 15628 + }, + { + "epoch": 4.226338561384532, + "grad_norm": 3.703125, + "learning_rate": 0.003107474826006339, + "loss": 2.8906, + "mean_token_accuracy": 0.4341852068901062, + "num_tokens": 7988991866.0, + "step": 15629 + }, + { + "epoch": 4.226608977825852, + "grad_norm": 4.15625, + "learning_rate": 0.003106717307415522, + "loss": 2.9162, + "mean_token_accuracy": 0.4301096200942993, + "num_tokens": 7989516066.0, + "step": 15630 + }, + { + "epoch": 4.226879394267171, + "grad_norm": 60.75, + "learning_rate": 0.0031059600310138335, + "loss": 7.9799, + "mean_token_accuracy": 0.02891455963253975, + "num_tokens": 7989997998.0, + "step": 15631 + }, + { + "epoch": 4.227149810708491, + "grad_norm": 7.90625, + "learning_rate": 0.0031052029968245036, + "loss": 3.1082, + "mean_token_accuracy": 0.38953274488449097, + "num_tokens": 7990522212.0, + "step": 15632 + }, + { + "epoch": 4.2274202271498105, + "grad_norm": 3.265625, + "learning_rate": 0.0031044462048707656, + "loss": 2.7377, + "mean_token_accuracy": 0.4273524880409241, + "num_tokens": 7991046420.0, + "step": 15633 + }, + { + "epoch": 4.227690643591131, + "grad_norm": 4.09375, + "learning_rate": 0.0031036896551758365, + "loss": 2.7994, + "mean_token_accuracy": 0.4389806091785431, + "num_tokens": 7991512595.0, + "step": 15634 + }, + { + "epoch": 4.22796106003245, + "grad_norm": 2.953125, + "learning_rate": 0.0031029333477629283, + "loss": 2.7638, + "mean_token_accuracy": 0.44830185174942017, + "num_tokens": 7992017526.0, + "step": 15635 + }, + { + "epoch": 4.22823147647377, + "grad_norm": 4.15625, + "learning_rate": 0.0031021772826552514, + "loss": 2.934, + "mean_token_accuracy": 0.4269734025001526, + "num_tokens": 7992541794.0, + "step": 15636 + }, + { + "epoch": 4.228501892915089, + "grad_norm": 4.1875, + "learning_rate": 0.0031014214598759997, + "loss": 3.0111, + "mean_token_accuracy": 0.40647315979003906, + "num_tokens": 7993066001.0, + "step": 15637 + }, + { + "epoch": 4.228772309356409, + "grad_norm": 4.21875, + "learning_rate": 0.003100665879448367, + "loss": 2.7218, + "mean_token_accuracy": 0.45275187492370605, + "num_tokens": 7993540149.0, + "step": 15638 + }, + { + "epoch": 4.229042725797728, + "grad_norm": 4.0, + "learning_rate": 0.0030999105413955365, + "loss": 2.8286, + "mean_token_accuracy": 0.4290238618850708, + "num_tokens": 7994064331.0, + "step": 15639 + }, + { + "epoch": 4.229313142239048, + "grad_norm": 3.328125, + "learning_rate": 0.00309915544574068, + "loss": 2.7642, + "mean_token_accuracy": 0.4388929009437561, + "num_tokens": 7994588510.0, + "step": 15640 + }, + { + "epoch": 4.229583558680368, + "grad_norm": 3.625, + "learning_rate": 0.0030984005925069713, + "loss": 2.761, + "mean_token_accuracy": 0.45014688372612, + "num_tokens": 7995112661.0, + "step": 15641 + }, + { + "epoch": 4.229853975121688, + "grad_norm": 3.859375, + "learning_rate": 0.00309764598171757, + "loss": 2.9831, + "mean_token_accuracy": 0.42294567823410034, + "num_tokens": 7995636897.0, + "step": 15642 + }, + { + "epoch": 4.230124391563007, + "grad_norm": 3.25, + "learning_rate": 0.003096891613395628, + "loss": 2.6653, + "mean_token_accuracy": 0.4637932777404785, + "num_tokens": 7996161101.0, + "step": 15643 + }, + { + "epoch": 4.230394808004327, + "grad_norm": 3.609375, + "learning_rate": 0.0030961374875642946, + "loss": 2.8022, + "mean_token_accuracy": 0.44875502586364746, + "num_tokens": 7996659116.0, + "step": 15644 + }, + { + "epoch": 4.230665224445646, + "grad_norm": 3.703125, + "learning_rate": 0.0030953836042467045, + "loss": 2.91, + "mean_token_accuracy": 0.4506458044052124, + "num_tokens": 7997183214.0, + "step": 15645 + }, + { + "epoch": 4.230935640886966, + "grad_norm": 3.484375, + "learning_rate": 0.0030946299634659912, + "loss": 2.765, + "mean_token_accuracy": 0.44367820024490356, + "num_tokens": 7997707403.0, + "step": 15646 + }, + { + "epoch": 4.2312060573282855, + "grad_norm": 3.359375, + "learning_rate": 0.0030938765652452804, + "loss": 2.7335, + "mean_token_accuracy": 0.45008742809295654, + "num_tokens": 7998224700.0, + "step": 15647 + }, + { + "epoch": 4.231476473769606, + "grad_norm": 3.546875, + "learning_rate": 0.0030931234096076878, + "loss": 2.8837, + "mean_token_accuracy": 0.42880311608314514, + "num_tokens": 7998748970.0, + "step": 15648 + }, + { + "epoch": 4.231746890210925, + "grad_norm": 3.40625, + "learning_rate": 0.0030923704965763192, + "loss": 2.7437, + "mean_token_accuracy": 0.46055665612220764, + "num_tokens": 7999223889.0, + "step": 15649 + }, + { + "epoch": 4.232017306652245, + "grad_norm": 3.375, + "learning_rate": 0.003091617826174281, + "loss": 2.9127, + "mean_token_accuracy": 0.4376169741153717, + "num_tokens": 7999715646.0, + "step": 15650 + }, + { + "epoch": 4.232287723093564, + "grad_norm": 43.5, + "learning_rate": 0.0030908653984246647, + "loss": 6.1007, + "mean_token_accuracy": 0.1474369466304779, + "num_tokens": 8000153424.0, + "step": 15651 + }, + { + "epoch": 4.232558139534884, + "grad_norm": 6.40625, + "learning_rate": 0.003090113213350555, + "loss": 2.8075, + "mean_token_accuracy": 0.43754100799560547, + "num_tokens": 8000677635.0, + "step": 15652 + }, + { + "epoch": 4.232828555976203, + "grad_norm": 2.96875, + "learning_rate": 0.0030893612709750346, + "loss": 2.8936, + "mean_token_accuracy": 0.41664037108421326, + "num_tokens": 8001201854.0, + "step": 15653 + }, + { + "epoch": 4.2330989724175225, + "grad_norm": 2.984375, + "learning_rate": 0.0030886095713211752, + "loss": 2.8199, + "mean_token_accuracy": 0.41930127143859863, + "num_tokens": 8001720859.0, + "step": 15654 + }, + { + "epoch": 4.233369388858843, + "grad_norm": 2.890625, + "learning_rate": 0.0030878581144120378, + "loss": 2.7353, + "mean_token_accuracy": 0.4470909535884857, + "num_tokens": 8002245081.0, + "step": 15655 + }, + { + "epoch": 4.233639805300163, + "grad_norm": 4.25, + "learning_rate": 0.0030871069002706836, + "loss": 2.8145, + "mean_token_accuracy": 0.42165321111679077, + "num_tokens": 8002769294.0, + "step": 15656 + }, + { + "epoch": 4.233910221741482, + "grad_norm": 3.65625, + "learning_rate": 0.0030863559289201585, + "loss": 2.8813, + "mean_token_accuracy": 0.42140907049179077, + "num_tokens": 8003293426.0, + "step": 15657 + }, + { + "epoch": 4.234180638182801, + "grad_norm": 3.75, + "learning_rate": 0.003085605200383508, + "loss": 2.8893, + "mean_token_accuracy": 0.43260663747787476, + "num_tokens": 8003817674.0, + "step": 15658 + }, + { + "epoch": 4.234451054624121, + "grad_norm": 4.21875, + "learning_rate": 0.0030848547146837615, + "loss": 2.9286, + "mean_token_accuracy": 0.43738752603530884, + "num_tokens": 8004341942.0, + "step": 15659 + }, + { + "epoch": 4.23472147106544, + "grad_norm": 3.8125, + "learning_rate": 0.0030841044718439505, + "loss": 2.7789, + "mean_token_accuracy": 0.4282543361186981, + "num_tokens": 8004866125.0, + "step": 15660 + }, + { + "epoch": 4.2349918875067605, + "grad_norm": 3.234375, + "learning_rate": 0.003083354471887095, + "loss": 2.7735, + "mean_token_accuracy": 0.45065122842788696, + "num_tokens": 8005390160.0, + "step": 15661 + }, + { + "epoch": 4.23526230394808, + "grad_norm": 3.21875, + "learning_rate": 0.0030826047148362046, + "loss": 2.7677, + "mean_token_accuracy": 0.4499633014202118, + "num_tokens": 8005914409.0, + "step": 15662 + }, + { + "epoch": 4.2355327203894, + "grad_norm": 3.71875, + "learning_rate": 0.0030818552007142847, + "loss": 2.8297, + "mean_token_accuracy": 0.4199226498603821, + "num_tokens": 8006438583.0, + "step": 15663 + }, + { + "epoch": 4.235803136830719, + "grad_norm": 3.46875, + "learning_rate": 0.003081105929544334, + "loss": 2.7976, + "mean_token_accuracy": 0.4475662112236023, + "num_tokens": 8006921567.0, + "step": 15664 + }, + { + "epoch": 4.236073553272039, + "grad_norm": 3.84375, + "learning_rate": 0.0030803569013493418, + "loss": 2.8738, + "mean_token_accuracy": 0.42944255471229553, + "num_tokens": 8007445815.0, + "step": 15665 + }, + { + "epoch": 4.236343969713358, + "grad_norm": 3.921875, + "learning_rate": 0.0030796081161522876, + "loss": 2.7663, + "mean_token_accuracy": 0.4373827576637268, + "num_tokens": 8007970082.0, + "step": 15666 + }, + { + "epoch": 4.236614386154678, + "grad_norm": 3.703125, + "learning_rate": 0.0030788595739761507, + "loss": 2.6315, + "mean_token_accuracy": 0.4466674327850342, + "num_tokens": 8008436975.0, + "step": 15667 + }, + { + "epoch": 4.2368848025959975, + "grad_norm": 3.46875, + "learning_rate": 0.0030781112748438973, + "loss": 2.6412, + "mean_token_accuracy": 0.4663916230201721, + "num_tokens": 8008923770.0, + "step": 15668 + }, + { + "epoch": 4.237155219037318, + "grad_norm": 4.46875, + "learning_rate": 0.0030773632187784845, + "loss": 2.8191, + "mean_token_accuracy": 0.42568710446357727, + "num_tokens": 8009447931.0, + "step": 15669 + }, + { + "epoch": 4.237425635478637, + "grad_norm": 4.28125, + "learning_rate": 0.0030766154058028683, + "loss": 2.8212, + "mean_token_accuracy": 0.4306434392929077, + "num_tokens": 8009972194.0, + "step": 15670 + }, + { + "epoch": 4.237696051919957, + "grad_norm": 55.5, + "learning_rate": 0.003075867835939994, + "loss": 5.4425, + "mean_token_accuracy": 0.20635372400283813, + "num_tokens": 8010496353.0, + "step": 15671 + }, + { + "epoch": 4.237966468361276, + "grad_norm": 7.4375, + "learning_rate": 0.0030751205092127954, + "loss": 2.8405, + "mean_token_accuracy": 0.44755494594573975, + "num_tokens": 8011020546.0, + "step": 15672 + }, + { + "epoch": 4.238236884802596, + "grad_norm": 2.546875, + "learning_rate": 0.003074373425644205, + "loss": 2.7979, + "mean_token_accuracy": 0.4432834982872009, + "num_tokens": 8011544747.0, + "step": 15673 + }, + { + "epoch": 4.238507301243915, + "grad_norm": 2.9375, + "learning_rate": 0.0030736265852571473, + "loss": 2.7828, + "mean_token_accuracy": 0.4435041546821594, + "num_tokens": 8012068977.0, + "step": 15674 + }, + { + "epoch": 4.2387777176852355, + "grad_norm": 3.546875, + "learning_rate": 0.003072879988074534, + "loss": 2.9187, + "mean_token_accuracy": 0.4407016336917877, + "num_tokens": 8012593154.0, + "step": 15675 + }, + { + "epoch": 4.239048134126555, + "grad_norm": 3.8125, + "learning_rate": 0.003072133634119276, + "loss": 2.6509, + "mean_token_accuracy": 0.4406387209892273, + "num_tokens": 8013117409.0, + "step": 15676 + }, + { + "epoch": 4.239318550567875, + "grad_norm": 3.1875, + "learning_rate": 0.0030713875234142742, + "loss": 2.8409, + "mean_token_accuracy": 0.4608795642852783, + "num_tokens": 8013580937.0, + "step": 15677 + }, + { + "epoch": 4.239588967009194, + "grad_norm": 3.484375, + "learning_rate": 0.0030706416559824166, + "loss": 2.5868, + "mean_token_accuracy": 0.5111615061759949, + "num_tokens": 8014041620.0, + "step": 15678 + }, + { + "epoch": 4.239859383450514, + "grad_norm": 3.78125, + "learning_rate": 0.003069896031846594, + "loss": 2.8668, + "mean_token_accuracy": 0.4332718849182129, + "num_tokens": 8014555127.0, + "step": 15679 + }, + { + "epoch": 4.240129799891833, + "grad_norm": 3.875, + "learning_rate": 0.0030691506510296825, + "loss": 2.8524, + "mean_token_accuracy": 0.4337978959083557, + "num_tokens": 8015079261.0, + "step": 15680 + }, + { + "epoch": 4.240400216333153, + "grad_norm": 3.546875, + "learning_rate": 0.0030684055135545493, + "loss": 2.7793, + "mean_token_accuracy": 0.4500667154788971, + "num_tokens": 8015545756.0, + "step": 15681 + }, + { + "epoch": 4.2406706327744725, + "grad_norm": 3.3125, + "learning_rate": 0.0030676606194440627, + "loss": 2.963, + "mean_token_accuracy": 0.43363046646118164, + "num_tokens": 8016070021.0, + "step": 15682 + }, + { + "epoch": 4.240941049215793, + "grad_norm": 3.828125, + "learning_rate": 0.0030669159687210733, + "loss": 2.6988, + "mean_token_accuracy": 0.44420307874679565, + "num_tokens": 8016594280.0, + "step": 15683 + }, + { + "epoch": 4.241211465657112, + "grad_norm": 3.515625, + "learning_rate": 0.003066171561408434, + "loss": 2.6546, + "mean_token_accuracy": 0.45808857679367065, + "num_tokens": 8017118485.0, + "step": 15684 + }, + { + "epoch": 4.241481882098432, + "grad_norm": 3.53125, + "learning_rate": 0.003065427397528983, + "loss": 2.7235, + "mean_token_accuracy": 0.435239315032959, + "num_tokens": 8017642763.0, + "step": 15685 + }, + { + "epoch": 4.241752298539751, + "grad_norm": 4.03125, + "learning_rate": 0.003064683477105552, + "loss": 2.9488, + "mean_token_accuracy": 0.43107277154922485, + "num_tokens": 8018167041.0, + "step": 15686 + }, + { + "epoch": 4.242022714981071, + "grad_norm": 5.34375, + "learning_rate": 0.0030639398001609673, + "loss": 2.9114, + "mean_token_accuracy": 0.43519920110702515, + "num_tokens": 8018691256.0, + "step": 15687 + }, + { + "epoch": 4.24229313142239, + "grad_norm": 3.328125, + "learning_rate": 0.0030631963667180507, + "loss": 2.7161, + "mean_token_accuracy": 0.4512246251106262, + "num_tokens": 8019215349.0, + "step": 15688 + }, + { + "epoch": 4.2425635478637105, + "grad_norm": 3.828125, + "learning_rate": 0.0030624531767996085, + "loss": 2.8061, + "mean_token_accuracy": 0.44995787739753723, + "num_tokens": 8019681880.0, + "step": 15689 + }, + { + "epoch": 4.24283396430503, + "grad_norm": 3.875, + "learning_rate": 0.0030617102304284468, + "loss": 2.7893, + "mean_token_accuracy": 0.44520363211631775, + "num_tokens": 8020206111.0, + "step": 15690 + }, + { + "epoch": 4.24310438074635, + "grad_norm": 126.5, + "learning_rate": 0.0030609675276273606, + "loss": 8.0838, + "mean_token_accuracy": 0.10319438576698303, + "num_tokens": 8020704417.0, + "step": 15691 + }, + { + "epoch": 4.243374797187669, + "grad_norm": 8.25, + "learning_rate": 0.0030602250684191356, + "loss": 2.9559, + "mean_token_accuracy": 0.4238792955875397, + "num_tokens": 8021149580.0, + "step": 15692 + }, + { + "epoch": 4.243645213628989, + "grad_norm": 2.59375, + "learning_rate": 0.0030594828528265577, + "loss": 2.8841, + "mean_token_accuracy": 0.42886751890182495, + "num_tokens": 8021673844.0, + "step": 15693 + }, + { + "epoch": 4.243915630070308, + "grad_norm": 4.46875, + "learning_rate": 0.003058740880872396, + "loss": 2.601, + "mean_token_accuracy": 0.44357699155807495, + "num_tokens": 8022198126.0, + "step": 15694 + }, + { + "epoch": 4.2441860465116275, + "grad_norm": 3.921875, + "learning_rate": 0.003057999152579417, + "loss": 2.759, + "mean_token_accuracy": 0.4509318470954895, + "num_tokens": 8022722149.0, + "step": 15695 + }, + { + "epoch": 4.2444564629529475, + "grad_norm": 5.71875, + "learning_rate": 0.0030572576679703816, + "loss": 3.1171, + "mean_token_accuracy": 0.4128682613372803, + "num_tokens": 8023246359.0, + "step": 15696 + }, + { + "epoch": 4.244726879394268, + "grad_norm": 4.1875, + "learning_rate": 0.003056516427068038, + "loss": 2.5922, + "mean_token_accuracy": 0.4534986913204193, + "num_tokens": 8023770537.0, + "step": 15697 + }, + { + "epoch": 4.244997295835587, + "grad_norm": 3.328125, + "learning_rate": 0.00305577542989513, + "loss": 2.8393, + "mean_token_accuracy": 0.4303264617919922, + "num_tokens": 8024294822.0, + "step": 15698 + }, + { + "epoch": 4.245267712276906, + "grad_norm": 3.609375, + "learning_rate": 0.0030550346764743963, + "loss": 2.8579, + "mean_token_accuracy": 0.4240330755710602, + "num_tokens": 8024818951.0, + "step": 15699 + }, + { + "epoch": 4.245538128718226, + "grad_norm": 3.859375, + "learning_rate": 0.0030542941668285604, + "loss": 2.9242, + "mean_token_accuracy": 0.44506487250328064, + "num_tokens": 8025278830.0, + "step": 15700 + }, + { + "epoch": 4.245808545159545, + "grad_norm": 3.6875, + "learning_rate": 0.0030535539009803486, + "loss": 2.6888, + "mean_token_accuracy": 0.44930851459503174, + "num_tokens": 8025802986.0, + "step": 15701 + }, + { + "epoch": 4.246078961600865, + "grad_norm": 3.4375, + "learning_rate": 0.00305281387895247, + "loss": 2.8433, + "mean_token_accuracy": 0.44402214884757996, + "num_tokens": 8026327263.0, + "step": 15702 + }, + { + "epoch": 4.246349378042185, + "grad_norm": 3.78125, + "learning_rate": 0.0030520741007676357, + "loss": 2.7938, + "mean_token_accuracy": 0.42931050062179565, + "num_tokens": 8026851466.0, + "step": 15703 + }, + { + "epoch": 4.246619794483505, + "grad_norm": 2.9375, + "learning_rate": 0.003051334566448539, + "loss": 2.7299, + "mean_token_accuracy": 0.44422659277915955, + "num_tokens": 8027375698.0, + "step": 15704 + }, + { + "epoch": 4.246890210924824, + "grad_norm": 3.921875, + "learning_rate": 0.003050595276017874, + "loss": 2.9063, + "mean_token_accuracy": 0.4174020290374756, + "num_tokens": 8027899930.0, + "step": 15705 + }, + { + "epoch": 4.247160627366144, + "grad_norm": 3.609375, + "learning_rate": 0.003049856229498323, + "loss": 2.7831, + "mean_token_accuracy": 0.44029730558395386, + "num_tokens": 8028424122.0, + "step": 15706 + }, + { + "epoch": 4.247431043807463, + "grad_norm": 4.0, + "learning_rate": 0.003049117426912564, + "loss": 2.7671, + "mean_token_accuracy": 0.46603551506996155, + "num_tokens": 8028948386.0, + "step": 15707 + }, + { + "epoch": 4.247701460248783, + "grad_norm": 4.1875, + "learning_rate": 0.0030483788682832635, + "loss": 2.8288, + "mean_token_accuracy": 0.41248267889022827, + "num_tokens": 8029472488.0, + "step": 15708 + }, + { + "epoch": 4.2479718766901025, + "grad_norm": 3.734375, + "learning_rate": 0.0030476405536330833, + "loss": 2.8974, + "mean_token_accuracy": 0.43852442502975464, + "num_tokens": 8029996744.0, + "step": 15709 + }, + { + "epoch": 4.2482422931314225, + "grad_norm": 4.0625, + "learning_rate": 0.003046902482984678, + "loss": 2.7191, + "mean_token_accuracy": 0.43704450130462646, + "num_tokens": 8030475157.0, + "step": 15710 + }, + { + "epoch": 4.248512709572742, + "grad_norm": 67.5, + "learning_rate": 0.003046164656360694, + "loss": 4.6157, + "mean_token_accuracy": 0.27459636330604553, + "num_tokens": 8030999416.0, + "step": 15711 + }, + { + "epoch": 4.248783126014062, + "grad_norm": 8.75, + "learning_rate": 0.0030454270737837663, + "loss": 2.7544, + "mean_token_accuracy": 0.4375683069229126, + "num_tokens": 8031523641.0, + "step": 15712 + }, + { + "epoch": 4.249053542455381, + "grad_norm": 2.703125, + "learning_rate": 0.0030446897352765312, + "loss": 2.8904, + "mean_token_accuracy": 0.43672794103622437, + "num_tokens": 8032047816.0, + "step": 15713 + }, + { + "epoch": 4.249323958896701, + "grad_norm": 3.28125, + "learning_rate": 0.003043952640861608, + "loss": 2.8416, + "mean_token_accuracy": 0.42807215452194214, + "num_tokens": 8032571982.0, + "step": 15714 + }, + { + "epoch": 4.24959437533802, + "grad_norm": 3.640625, + "learning_rate": 0.003043215790561617, + "loss": 3.0191, + "mean_token_accuracy": 0.4161236584186554, + "num_tokens": 8033096222.0, + "step": 15715 + }, + { + "epoch": 4.24986479177934, + "grad_norm": 3.9375, + "learning_rate": 0.0030424791843991633, + "loss": 2.9625, + "mean_token_accuracy": 0.43521344661712646, + "num_tokens": 8033620486.0, + "step": 15716 + }, + { + "epoch": 4.25013520822066, + "grad_norm": 4.4375, + "learning_rate": 0.003041742822396851, + "loss": 2.8487, + "mean_token_accuracy": 0.44000929594039917, + "num_tokens": 8034144625.0, + "step": 15717 + }, + { + "epoch": 4.25040562466198, + "grad_norm": 3.671875, + "learning_rate": 0.0030410067045772717, + "loss": 2.873, + "mean_token_accuracy": 0.43715471029281616, + "num_tokens": 8034668709.0, + "step": 15718 + }, + { + "epoch": 4.250676041103299, + "grad_norm": 3.59375, + "learning_rate": 0.003040270830963014, + "loss": 2.7598, + "mean_token_accuracy": 0.44235432147979736, + "num_tokens": 8035147304.0, + "step": 15719 + }, + { + "epoch": 4.250946457544619, + "grad_norm": 3.90625, + "learning_rate": 0.0030395352015766554, + "loss": 2.8593, + "mean_token_accuracy": 0.45396703481674194, + "num_tokens": 8035624408.0, + "step": 15720 + }, + { + "epoch": 4.251216873985938, + "grad_norm": 3.234375, + "learning_rate": 0.0030387998164407644, + "loss": 2.7863, + "mean_token_accuracy": 0.43006476759910583, + "num_tokens": 8036148590.0, + "step": 15721 + }, + { + "epoch": 4.251487290427258, + "grad_norm": 3.15625, + "learning_rate": 0.00303806467557791, + "loss": 2.9206, + "mean_token_accuracy": 0.4316517114639282, + "num_tokens": 8036672816.0, + "step": 15722 + }, + { + "epoch": 4.2517577068685775, + "grad_norm": 7.25, + "learning_rate": 0.003037329779010646, + "loss": 2.6231, + "mean_token_accuracy": 0.45748981833457947, + "num_tokens": 8037180743.0, + "step": 15723 + }, + { + "epoch": 4.2520281233098975, + "grad_norm": 3.171875, + "learning_rate": 0.0030365951267615175, + "loss": 2.9302, + "mean_token_accuracy": 0.43116360902786255, + "num_tokens": 8037704958.0, + "step": 15724 + }, + { + "epoch": 4.252298539751217, + "grad_norm": 3.015625, + "learning_rate": 0.003035860718853073, + "loss": 2.6164, + "mean_token_accuracy": 0.44967058300971985, + "num_tokens": 8038229198.0, + "step": 15725 + }, + { + "epoch": 4.252568956192537, + "grad_norm": 3.171875, + "learning_rate": 0.003035126555307842, + "loss": 2.9986, + "mean_token_accuracy": 0.4546716809272766, + "num_tokens": 8038688518.0, + "step": 15726 + }, + { + "epoch": 4.252839372633856, + "grad_norm": 2.75, + "learning_rate": 0.003034392636148351, + "loss": 2.8574, + "mean_token_accuracy": 0.4463338255882263, + "num_tokens": 8039212768.0, + "step": 15727 + }, + { + "epoch": 4.253109789075176, + "grad_norm": 4.09375, + "learning_rate": 0.0030336589613971197, + "loss": 2.9158, + "mean_token_accuracy": 0.4440513253211975, + "num_tokens": 8039664241.0, + "step": 15728 + }, + { + "epoch": 4.253380205516495, + "grad_norm": 4.53125, + "learning_rate": 0.003032925531076657, + "loss": 2.5887, + "mean_token_accuracy": 0.44580745697021484, + "num_tokens": 8040188421.0, + "step": 15729 + }, + { + "epoch": 4.253650621957815, + "grad_norm": 4.09375, + "learning_rate": 0.0030321923452094686, + "loss": 2.7168, + "mean_token_accuracy": 0.43924829363822937, + "num_tokens": 8040712649.0, + "step": 15730 + }, + { + "epoch": 4.253921038399135, + "grad_norm": 24.25, + "learning_rate": 0.003031459403818053, + "loss": 2.3736, + "mean_token_accuracy": 0.5019532442092896, + "num_tokens": 8041195512.0, + "step": 15731 + }, + { + "epoch": 4.254191454840455, + "grad_norm": 9.25, + "learning_rate": 0.0030307267069248954, + "loss": 3.0391, + "mean_token_accuracy": 0.4244895279407501, + "num_tokens": 8041719724.0, + "step": 15732 + }, + { + "epoch": 4.254461871281774, + "grad_norm": 4.125, + "learning_rate": 0.0030299942545524796, + "loss": 2.7004, + "mean_token_accuracy": 0.4389292001724243, + "num_tokens": 8042243884.0, + "step": 15733 + }, + { + "epoch": 4.254732287723094, + "grad_norm": 3.71875, + "learning_rate": 0.0030292620467232793, + "loss": 2.7349, + "mean_token_accuracy": 0.4509176015853882, + "num_tokens": 8042767952.0, + "step": 15734 + }, + { + "epoch": 4.255002704164413, + "grad_norm": 3.484375, + "learning_rate": 0.0030285300834597567, + "loss": 2.8745, + "mean_token_accuracy": 0.4239038825035095, + "num_tokens": 8043292135.0, + "step": 15735 + }, + { + "epoch": 4.255273120605732, + "grad_norm": 3.921875, + "learning_rate": 0.003027798364784377, + "loss": 2.7144, + "mean_token_accuracy": 0.44534429907798767, + "num_tokens": 8043816410.0, + "step": 15736 + }, + { + "epoch": 4.2555435370470525, + "grad_norm": 3.578125, + "learning_rate": 0.0030270668907195868, + "loss": 2.8314, + "mean_token_accuracy": 0.43014585971832275, + "num_tokens": 8044340568.0, + "step": 15737 + }, + { + "epoch": 4.2558139534883725, + "grad_norm": 4.0625, + "learning_rate": 0.0030263356612878307, + "loss": 2.8446, + "mean_token_accuracy": 0.4294445812702179, + "num_tokens": 8044864762.0, + "step": 15738 + }, + { + "epoch": 4.256084369929692, + "grad_norm": 3.859375, + "learning_rate": 0.0030256046765115463, + "loss": 2.8005, + "mean_token_accuracy": 0.4286345839500427, + "num_tokens": 8045388976.0, + "step": 15739 + }, + { + "epoch": 4.256354786371011, + "grad_norm": 4.3125, + "learning_rate": 0.0030248739364131623, + "loss": 2.9671, + "mean_token_accuracy": 0.4290773272514343, + "num_tokens": 8045913190.0, + "step": 15740 + }, + { + "epoch": 4.256625202812331, + "grad_norm": 3.734375, + "learning_rate": 0.003024143441015097, + "loss": 2.7321, + "mean_token_accuracy": 0.44497939944267273, + "num_tokens": 8046395889.0, + "step": 15741 + }, + { + "epoch": 4.25689561925365, + "grad_norm": 4.34375, + "learning_rate": 0.0030234131903397687, + "loss": 2.719, + "mean_token_accuracy": 0.4329284429550171, + "num_tokens": 8046920086.0, + "step": 15742 + }, + { + "epoch": 4.25716603569497, + "grad_norm": 3.84375, + "learning_rate": 0.0030226831844095783, + "loss": 2.6867, + "mean_token_accuracy": 0.44416362047195435, + "num_tokens": 8047444354.0, + "step": 15743 + }, + { + "epoch": 4.2574364521362895, + "grad_norm": 3.6875, + "learning_rate": 0.0030219534232469305, + "loss": 2.6439, + "mean_token_accuracy": 0.4427258372306824, + "num_tokens": 8047968629.0, + "step": 15744 + }, + { + "epoch": 4.25770686857761, + "grad_norm": 3.65625, + "learning_rate": 0.003021223906874212, + "loss": 2.8404, + "mean_token_accuracy": 0.39970993995666504, + "num_tokens": 8048492717.0, + "step": 15745 + }, + { + "epoch": 4.257977285018929, + "grad_norm": 3.09375, + "learning_rate": 0.0030204946353138084, + "loss": 2.7836, + "mean_token_accuracy": 0.433090478181839, + "num_tokens": 8048959113.0, + "step": 15746 + }, + { + "epoch": 4.258247701460249, + "grad_norm": 2.796875, + "learning_rate": 0.0030197656085880955, + "loss": 2.7046, + "mean_token_accuracy": 0.4512650966644287, + "num_tokens": 8049483270.0, + "step": 15747 + }, + { + "epoch": 4.258518117901568, + "grad_norm": 3.515625, + "learning_rate": 0.0030190368267194435, + "loss": 2.7788, + "mean_token_accuracy": 0.44794145226478577, + "num_tokens": 8049977716.0, + "step": 15748 + }, + { + "epoch": 4.258788534342888, + "grad_norm": 3.75, + "learning_rate": 0.0030183082897302115, + "loss": 2.765, + "mean_token_accuracy": 0.4578207731246948, + "num_tokens": 8050501891.0, + "step": 15749 + }, + { + "epoch": 4.259058950784207, + "grad_norm": 3.25, + "learning_rate": 0.0030175799976427525, + "loss": 2.7865, + "mean_token_accuracy": 0.4533543288707733, + "num_tokens": 8051026164.0, + "step": 15750 + }, + { + "epoch": 4.2593293672255275, + "grad_norm": 118.5, + "learning_rate": 0.003016851950479415, + "loss": 4.9599, + "mean_token_accuracy": 0.23505064845085144, + "num_tokens": 8051525805.0, + "step": 15751 + }, + { + "epoch": 4.259599783666847, + "grad_norm": 7.15625, + "learning_rate": 0.003016124148262538, + "loss": 2.7953, + "mean_token_accuracy": 0.44458770751953125, + "num_tokens": 8052009734.0, + "step": 15752 + }, + { + "epoch": 4.259870200108167, + "grad_norm": 3.046875, + "learning_rate": 0.0030153965910144475, + "loss": 2.8102, + "mean_token_accuracy": 0.4503263235092163, + "num_tokens": 8052534010.0, + "step": 15753 + }, + { + "epoch": 4.260140616549486, + "grad_norm": 3.875, + "learning_rate": 0.0030146692787574743, + "loss": 2.8535, + "mean_token_accuracy": 0.4263055920600891, + "num_tokens": 8053058205.0, + "step": 15754 + }, + { + "epoch": 4.260411032990806, + "grad_norm": 3.421875, + "learning_rate": 0.0030139422115139283, + "loss": 2.9175, + "mean_token_accuracy": 0.4268200099468231, + "num_tokens": 8053582484.0, + "step": 15755 + }, + { + "epoch": 4.260681449432125, + "grad_norm": 3.703125, + "learning_rate": 0.0030132153893061226, + "loss": 2.8584, + "mean_token_accuracy": 0.44724875688552856, + "num_tokens": 8054106680.0, + "step": 15756 + }, + { + "epoch": 4.260951865873445, + "grad_norm": 3.9375, + "learning_rate": 0.0030124888121563554, + "loss": 2.6399, + "mean_token_accuracy": 0.44541388750076294, + "num_tokens": 8054630872.0, + "step": 15757 + }, + { + "epoch": 4.2612222823147645, + "grad_norm": 3.703125, + "learning_rate": 0.0030117624800869215, + "loss": 2.8613, + "mean_token_accuracy": 0.4277408719062805, + "num_tokens": 8055155113.0, + "step": 15758 + }, + { + "epoch": 4.261492698756085, + "grad_norm": 3.1875, + "learning_rate": 0.003011036393120105, + "loss": 2.8122, + "mean_token_accuracy": 0.4373628497123718, + "num_tokens": 8055679393.0, + "step": 15759 + }, + { + "epoch": 4.261763115197404, + "grad_norm": 3.265625, + "learning_rate": 0.003010310551278188, + "loss": 2.7237, + "mean_token_accuracy": 0.44468462467193604, + "num_tokens": 8056203590.0, + "step": 15760 + }, + { + "epoch": 4.262033531638724, + "grad_norm": 3.671875, + "learning_rate": 0.0030095849545834392, + "loss": 2.9661, + "mean_token_accuracy": 0.4209299683570862, + "num_tokens": 8056727862.0, + "step": 15761 + }, + { + "epoch": 4.262303948080043, + "grad_norm": 3.890625, + "learning_rate": 0.003008859603058122, + "loss": 2.9258, + "mean_token_accuracy": 0.42058372497558594, + "num_tokens": 8057252010.0, + "step": 15762 + }, + { + "epoch": 4.262574364521363, + "grad_norm": 3.46875, + "learning_rate": 0.003008134496724494, + "loss": 2.7365, + "mean_token_accuracy": 0.42967748641967773, + "num_tokens": 8057776244.0, + "step": 15763 + }, + { + "epoch": 4.262844780962682, + "grad_norm": 3.703125, + "learning_rate": 0.003007409635604799, + "loss": 2.682, + "mean_token_accuracy": 0.43762364983558655, + "num_tokens": 8058294917.0, + "step": 15764 + }, + { + "epoch": 4.2631151974040025, + "grad_norm": 3.6875, + "learning_rate": 0.0030066850197212843, + "loss": 2.728, + "mean_token_accuracy": 0.46002325415611267, + "num_tokens": 8058819046.0, + "step": 15765 + }, + { + "epoch": 4.263385613845322, + "grad_norm": 3.640625, + "learning_rate": 0.003005960649096179, + "loss": 2.8271, + "mean_token_accuracy": 0.44275736808776855, + "num_tokens": 8059343069.0, + "step": 15766 + }, + { + "epoch": 4.263656030286642, + "grad_norm": 3.25, + "learning_rate": 0.0030052365237517088, + "loss": 2.741, + "mean_token_accuracy": 0.4560042917728424, + "num_tokens": 8059782587.0, + "step": 15767 + }, + { + "epoch": 4.263926446727961, + "grad_norm": 3.21875, + "learning_rate": 0.003004512643710094, + "loss": 2.7316, + "mean_token_accuracy": 0.4689110219478607, + "num_tokens": 8060267037.0, + "step": 15768 + }, + { + "epoch": 4.264196863169281, + "grad_norm": 3.859375, + "learning_rate": 0.0030037890089935453, + "loss": 2.8072, + "mean_token_accuracy": 0.4309747815132141, + "num_tokens": 8060791317.0, + "step": 15769 + }, + { + "epoch": 4.2644672796106, + "grad_norm": 3.609375, + "learning_rate": 0.003003065619624263, + "loss": 2.8388, + "mean_token_accuracy": 0.4464949369430542, + "num_tokens": 8061241884.0, + "step": 15770 + }, + { + "epoch": 4.26473769605192, + "grad_norm": 88.0, + "learning_rate": 0.003002342475624447, + "loss": 8.0528, + "mean_token_accuracy": 0.06172147020697594, + "num_tokens": 8061766099.0, + "step": 15771 + }, + { + "epoch": 4.2650081124932395, + "grad_norm": 8.5625, + "learning_rate": 0.003001619577016281, + "loss": 2.6939, + "mean_token_accuracy": 0.43106693029403687, + "num_tokens": 8062290310.0, + "step": 15772 + }, + { + "epoch": 4.26527852893456, + "grad_norm": 3.015625, + "learning_rate": 0.0030008969238219485, + "loss": 2.9025, + "mean_token_accuracy": 0.4232204258441925, + "num_tokens": 8062814376.0, + "step": 15773 + }, + { + "epoch": 4.265548945375879, + "grad_norm": 3.90625, + "learning_rate": 0.003000174516063624, + "loss": 2.6876, + "mean_token_accuracy": 0.4548313319683075, + "num_tokens": 8063338643.0, + "step": 15774 + }, + { + "epoch": 4.265819361817199, + "grad_norm": 3.203125, + "learning_rate": 0.002999452353763471, + "loss": 2.6531, + "mean_token_accuracy": 0.4627910852432251, + "num_tokens": 8063862882.0, + "step": 15775 + }, + { + "epoch": 4.266089778258518, + "grad_norm": 3.75, + "learning_rate": 0.0029987304369436464, + "loss": 2.7656, + "mean_token_accuracy": 0.44166186451911926, + "num_tokens": 8064361693.0, + "step": 15776 + }, + { + "epoch": 4.266360194699837, + "grad_norm": 3.5, + "learning_rate": 0.0029980087656263045, + "loss": 2.9369, + "mean_token_accuracy": 0.43948960304260254, + "num_tokens": 8064885904.0, + "step": 15777 + }, + { + "epoch": 4.266630611141157, + "grad_norm": 4.5, + "learning_rate": 0.0029972873398335835, + "loss": 3.042, + "mean_token_accuracy": 0.41232380270957947, + "num_tokens": 8065410187.0, + "step": 15778 + }, + { + "epoch": 4.2669010275824775, + "grad_norm": 3.46875, + "learning_rate": 0.002996566159587623, + "loss": 2.7665, + "mean_token_accuracy": 0.42444801330566406, + "num_tokens": 8065903846.0, + "step": 15779 + }, + { + "epoch": 4.267171444023797, + "grad_norm": 3.671875, + "learning_rate": 0.0029958452249105494, + "loss": 2.6508, + "mean_token_accuracy": 0.46717748045921326, + "num_tokens": 8066428088.0, + "step": 15780 + }, + { + "epoch": 4.267441860465116, + "grad_norm": 4.21875, + "learning_rate": 0.0029951245358244816, + "loss": 2.9163, + "mean_token_accuracy": 0.4411618113517761, + "num_tokens": 8066952180.0, + "step": 15781 + }, + { + "epoch": 4.267712276906436, + "grad_norm": 3.640625, + "learning_rate": 0.0029944040923515357, + "loss": 2.7285, + "mean_token_accuracy": 0.43050605058670044, + "num_tokens": 8067476353.0, + "step": 15782 + }, + { + "epoch": 4.267982693347755, + "grad_norm": 3.359375, + "learning_rate": 0.0029936838945138146, + "loss": 2.9157, + "mean_token_accuracy": 0.42264118790626526, + "num_tokens": 8068000519.0, + "step": 15783 + }, + { + "epoch": 4.268253109789075, + "grad_norm": 3.875, + "learning_rate": 0.0029929639423334153, + "loss": 2.6983, + "mean_token_accuracy": 0.45876991748809814, + "num_tokens": 8068524696.0, + "step": 15784 + }, + { + "epoch": 4.268523526230394, + "grad_norm": 3.359375, + "learning_rate": 0.0029922442358324315, + "loss": 2.8059, + "mean_token_accuracy": 0.46661579608917236, + "num_tokens": 8069023837.0, + "step": 15785 + }, + { + "epoch": 4.2687939426717145, + "grad_norm": 3.34375, + "learning_rate": 0.002991524775032941, + "loss": 2.7443, + "mean_token_accuracy": 0.4603274464607239, + "num_tokens": 8069548015.0, + "step": 15786 + }, + { + "epoch": 4.269064359113034, + "grad_norm": 3.65625, + "learning_rate": 0.0029908055599570235, + "loss": 2.72, + "mean_token_accuracy": 0.45638835430145264, + "num_tokens": 8070017638.0, + "step": 15787 + }, + { + "epoch": 4.269334775554354, + "grad_norm": 3.5625, + "learning_rate": 0.002990086590626745, + "loss": 2.9051, + "mean_token_accuracy": 0.4189991354942322, + "num_tokens": 8070541863.0, + "step": 15788 + }, + { + "epoch": 4.269605191995673, + "grad_norm": 3.703125, + "learning_rate": 0.0029893678670641663, + "loss": 2.8905, + "mean_token_accuracy": 0.43879055976867676, + "num_tokens": 8071066071.0, + "step": 15789 + }, + { + "epoch": 4.269875608436993, + "grad_norm": 3.625, + "learning_rate": 0.002988649389291337, + "loss": 2.8136, + "mean_token_accuracy": 0.4435293674468994, + "num_tokens": 8071590257.0, + "step": 15790 + }, + { + "epoch": 4.270146024878312, + "grad_norm": 118.5, + "learning_rate": 0.002987931157330306, + "loss": 9.6359, + "mean_token_accuracy": 0.04515842720866203, + "num_tokens": 8072114536.0, + "step": 15791 + }, + { + "epoch": 4.270416441319632, + "grad_norm": 7.8125, + "learning_rate": 0.0029872131712031097, + "loss": 2.6938, + "mean_token_accuracy": 0.45075494050979614, + "num_tokens": 8072638546.0, + "step": 15792 + }, + { + "epoch": 4.270686857760952, + "grad_norm": 2.1875, + "learning_rate": 0.0029864954309317755, + "loss": 2.8148, + "mean_token_accuracy": 0.44148576259613037, + "num_tokens": 8073110556.0, + "step": 15793 + }, + { + "epoch": 4.270957274202272, + "grad_norm": 2.703125, + "learning_rate": 0.0029857779365383284, + "loss": 2.8176, + "mean_token_accuracy": 0.44321757555007935, + "num_tokens": 8073634769.0, + "step": 15794 + }, + { + "epoch": 4.271227690643591, + "grad_norm": 2.5, + "learning_rate": 0.002985060688044784, + "loss": 2.6956, + "mean_token_accuracy": 0.43669021129608154, + "num_tokens": 8074158976.0, + "step": 15795 + }, + { + "epoch": 4.271498107084911, + "grad_norm": 3.4375, + "learning_rate": 0.0029843436854731452, + "loss": 2.8945, + "mean_token_accuracy": 0.43389892578125, + "num_tokens": 8074683248.0, + "step": 15796 + }, + { + "epoch": 4.27176852352623, + "grad_norm": 3.515625, + "learning_rate": 0.0029836269288454165, + "loss": 2.6826, + "mean_token_accuracy": 0.42442500591278076, + "num_tokens": 8075207516.0, + "step": 15797 + }, + { + "epoch": 4.27203893996755, + "grad_norm": 3.109375, + "learning_rate": 0.0029829104181835894, + "loss": 2.8266, + "mean_token_accuracy": 0.45217394828796387, + "num_tokens": 8075731619.0, + "step": 15798 + }, + { + "epoch": 4.272309356408869, + "grad_norm": 3.796875, + "learning_rate": 0.0029821941535096443, + "loss": 2.9474, + "mean_token_accuracy": 0.43512606620788574, + "num_tokens": 8076171692.0, + "step": 15799 + }, + { + "epoch": 4.2725797728501895, + "grad_norm": 3.640625, + "learning_rate": 0.0029814781348455623, + "loss": 2.7684, + "mean_token_accuracy": 0.43856358528137207, + "num_tokens": 8076695930.0, + "step": 15800 + }, + { + "epoch": 4.272850189291509, + "grad_norm": 4.40625, + "learning_rate": 0.0029807623622133132, + "loss": 3.0476, + "mean_token_accuracy": 0.41037604212760925, + "num_tokens": 8077196271.0, + "step": 15801 + }, + { + "epoch": 4.273120605732829, + "grad_norm": 3.5625, + "learning_rate": 0.002980046835634856, + "loss": 2.8308, + "mean_token_accuracy": 0.41913628578186035, + "num_tokens": 8077720521.0, + "step": 15802 + }, + { + "epoch": 4.273391022174148, + "grad_norm": 3.65625, + "learning_rate": 0.0029793315551321494, + "loss": 2.681, + "mean_token_accuracy": 0.46167147159576416, + "num_tokens": 8078208888.0, + "step": 15803 + }, + { + "epoch": 4.273661438615468, + "grad_norm": 3.59375, + "learning_rate": 0.0029786165207271354, + "loss": 2.6568, + "mean_token_accuracy": 0.44024622440338135, + "num_tokens": 8078733145.0, + "step": 15804 + }, + { + "epoch": 4.273931855056787, + "grad_norm": 3.8125, + "learning_rate": 0.002977901732441759, + "loss": 2.8762, + "mean_token_accuracy": 0.4411267638206482, + "num_tokens": 8079196120.0, + "step": 15805 + }, + { + "epoch": 4.274202271498107, + "grad_norm": 4.53125, + "learning_rate": 0.0029771871902979467, + "loss": 2.7708, + "mean_token_accuracy": 0.423347532749176, + "num_tokens": 8079720399.0, + "step": 15806 + }, + { + "epoch": 4.274472687939427, + "grad_norm": 4.15625, + "learning_rate": 0.002976472894317625, + "loss": 2.9553, + "mean_token_accuracy": 0.43123576045036316, + "num_tokens": 8080216453.0, + "step": 15807 + }, + { + "epoch": 4.274743104380747, + "grad_norm": 4.125, + "learning_rate": 0.002975758844522712, + "loss": 2.8055, + "mean_token_accuracy": 0.4340951144695282, + "num_tokens": 8080740657.0, + "step": 15808 + }, + { + "epoch": 4.275013520822066, + "grad_norm": 4.59375, + "learning_rate": 0.0029750450409351155, + "loss": 2.7986, + "mean_token_accuracy": 0.44395047426223755, + "num_tokens": 8081264654.0, + "step": 15809 + }, + { + "epoch": 4.275283937263386, + "grad_norm": 3.796875, + "learning_rate": 0.002974331483576735, + "loss": 2.9674, + "mean_token_accuracy": 0.41837242245674133, + "num_tokens": 8081788931.0, + "step": 15810 + }, + { + "epoch": 4.275554353704705, + "grad_norm": 73.0, + "learning_rate": 0.002973618172469468, + "loss": 7.2646, + "mean_token_accuracy": 0.08501394093036652, + "num_tokens": 8082313031.0, + "step": 15811 + }, + { + "epoch": 4.275824770146025, + "grad_norm": 8.8125, + "learning_rate": 0.0029729051076352, + "loss": 2.7015, + "mean_token_accuracy": 0.4211067855358124, + "num_tokens": 8082837289.0, + "step": 15812 + }, + { + "epoch": 4.276095186587344, + "grad_norm": 3.03125, + "learning_rate": 0.0029721922890958074, + "loss": 2.6528, + "mean_token_accuracy": 0.44664591550827026, + "num_tokens": 8083361408.0, + "step": 15813 + }, + { + "epoch": 4.2763656030286645, + "grad_norm": 3.40625, + "learning_rate": 0.0029714797168731654, + "loss": 2.8414, + "mean_token_accuracy": 0.43023574352264404, + "num_tokens": 8083885659.0, + "step": 15814 + }, + { + "epoch": 4.276636019469984, + "grad_norm": 2.9375, + "learning_rate": 0.0029707673909891348, + "loss": 2.8084, + "mean_token_accuracy": 0.45700621604919434, + "num_tokens": 8084352564.0, + "step": 15815 + }, + { + "epoch": 4.276906435911304, + "grad_norm": 7.59375, + "learning_rate": 0.0029700553114655726, + "loss": 2.4932, + "mean_token_accuracy": 0.47139644622802734, + "num_tokens": 8084876849.0, + "step": 15816 + }, + { + "epoch": 4.277176852352623, + "grad_norm": 3.375, + "learning_rate": 0.002969343478324329, + "loss": 2.9493, + "mean_token_accuracy": 0.42515748739242554, + "num_tokens": 8085401018.0, + "step": 15817 + }, + { + "epoch": 4.277447268793942, + "grad_norm": 3.25, + "learning_rate": 0.0029686318915872444, + "loss": 2.6703, + "mean_token_accuracy": 0.46779540181159973, + "num_tokens": 8085867646.0, + "step": 15818 + }, + { + "epoch": 4.277717685235262, + "grad_norm": 3.734375, + "learning_rate": 0.002967920551276152, + "loss": 2.7312, + "mean_token_accuracy": 0.44033676385879517, + "num_tokens": 8086391908.0, + "step": 15819 + }, + { + "epoch": 4.277988101676582, + "grad_norm": 4.28125, + "learning_rate": 0.002967209457412878, + "loss": 2.9579, + "mean_token_accuracy": 0.3931163251399994, + "num_tokens": 8086916190.0, + "step": 15820 + }, + { + "epoch": 4.278258518117902, + "grad_norm": 3.796875, + "learning_rate": 0.0029664986100192414, + "loss": 2.9538, + "mean_token_accuracy": 0.4270288646221161, + "num_tokens": 8087440433.0, + "step": 15821 + }, + { + "epoch": 4.278528934559221, + "grad_norm": 4.78125, + "learning_rate": 0.0029657880091170496, + "loss": 2.8397, + "mean_token_accuracy": 0.43419405817985535, + "num_tokens": 8087964590.0, + "step": 15822 + }, + { + "epoch": 4.278799351000541, + "grad_norm": 3.546875, + "learning_rate": 0.002965077654728112, + "loss": 2.7835, + "mean_token_accuracy": 0.44648146629333496, + "num_tokens": 8088488654.0, + "step": 15823 + }, + { + "epoch": 4.27906976744186, + "grad_norm": 4.96875, + "learning_rate": 0.0029643675468742185, + "loss": 2.8921, + "mean_token_accuracy": 0.41350018978118896, + "num_tokens": 8089012922.0, + "step": 15824 + }, + { + "epoch": 4.27934018388318, + "grad_norm": 3.609375, + "learning_rate": 0.002963657685577162, + "loss": 2.6723, + "mean_token_accuracy": 0.4471854567527771, + "num_tokens": 8089537129.0, + "step": 15825 + }, + { + "epoch": 4.279610600324499, + "grad_norm": 3.953125, + "learning_rate": 0.0029629480708587208, + "loss": 2.7191, + "mean_token_accuracy": 0.43854227662086487, + "num_tokens": 8090040567.0, + "step": 15826 + }, + { + "epoch": 4.2798810167658194, + "grad_norm": 3.78125, + "learning_rate": 0.002962238702740667, + "loss": 2.5569, + "mean_token_accuracy": 0.4413839280605316, + "num_tokens": 8090564777.0, + "step": 15827 + }, + { + "epoch": 4.280151433207139, + "grad_norm": 2.859375, + "learning_rate": 0.002961529581244769, + "loss": 2.9324, + "mean_token_accuracy": 0.42177814245224, + "num_tokens": 8091088926.0, + "step": 15828 + }, + { + "epoch": 4.280421849648459, + "grad_norm": 3.75, + "learning_rate": 0.002960820706392781, + "loss": 2.9062, + "mean_token_accuracy": 0.4224337935447693, + "num_tokens": 8091613090.0, + "step": 15829 + }, + { + "epoch": 4.280692266089778, + "grad_norm": 3.90625, + "learning_rate": 0.002960112078206455, + "loss": 3.0701, + "mean_token_accuracy": 0.4100463092327118, + "num_tokens": 8092137178.0, + "step": 15830 + }, + { + "epoch": 4.280962682531098, + "grad_norm": 75.5, + "learning_rate": 0.0029594036967075365, + "loss": 8.573, + "mean_token_accuracy": 0.046109169721603394, + "num_tokens": 8092611337.0, + "step": 15831 + }, + { + "epoch": 4.281233098972417, + "grad_norm": 6.25, + "learning_rate": 0.002958695561917758, + "loss": 2.7909, + "mean_token_accuracy": 0.4297061562538147, + "num_tokens": 8093091601.0, + "step": 15832 + }, + { + "epoch": 4.281503515413737, + "grad_norm": 3.5625, + "learning_rate": 0.0029579876738588464, + "loss": 3.0308, + "mean_token_accuracy": 0.42255938053131104, + "num_tokens": 8093615873.0, + "step": 15833 + }, + { + "epoch": 4.2817739318550565, + "grad_norm": 3.90625, + "learning_rate": 0.002957280032552524, + "loss": 2.8616, + "mean_token_accuracy": 0.4596409499645233, + "num_tokens": 8094091493.0, + "step": 15834 + }, + { + "epoch": 4.282044348296377, + "grad_norm": 3.078125, + "learning_rate": 0.0029565726380205025, + "loss": 2.9055, + "mean_token_accuracy": 0.4276365637779236, + "num_tokens": 8094615768.0, + "step": 15835 + }, + { + "epoch": 4.282314764737696, + "grad_norm": 5.0, + "learning_rate": 0.002955865490284485, + "loss": 2.359, + "mean_token_accuracy": 0.495593786239624, + "num_tokens": 8095140005.0, + "step": 15836 + }, + { + "epoch": 4.282585181179016, + "grad_norm": 3.265625, + "learning_rate": 0.0029551585893661733, + "loss": 2.8235, + "mean_token_accuracy": 0.43292516469955444, + "num_tokens": 8095664191.0, + "step": 15837 + }, + { + "epoch": 4.282855597620335, + "grad_norm": 3.859375, + "learning_rate": 0.0029544519352872533, + "loss": 3.022, + "mean_token_accuracy": 0.4367990791797638, + "num_tokens": 8096125650.0, + "step": 15838 + }, + { + "epoch": 4.283126014061655, + "grad_norm": 3.875, + "learning_rate": 0.0029537455280694066, + "loss": 2.8048, + "mean_token_accuracy": 0.435922771692276, + "num_tokens": 8096649827.0, + "step": 15839 + }, + { + "epoch": 4.283396430502974, + "grad_norm": 3.921875, + "learning_rate": 0.002953039367734312, + "loss": 2.8315, + "mean_token_accuracy": 0.4341246485710144, + "num_tokens": 8097174048.0, + "step": 15840 + }, + { + "epoch": 4.2836668469442944, + "grad_norm": 3.734375, + "learning_rate": 0.0029523334543036335, + "loss": 2.8109, + "mean_token_accuracy": 0.44639262557029724, + "num_tokens": 8097698160.0, + "step": 15841 + }, + { + "epoch": 4.283937263385614, + "grad_norm": 4.5, + "learning_rate": 0.00295162778779903, + "loss": 2.7043, + "mean_token_accuracy": 0.4063856899738312, + "num_tokens": 8098222167.0, + "step": 15842 + }, + { + "epoch": 4.284207679826934, + "grad_norm": 3.078125, + "learning_rate": 0.002950922368242154, + "loss": 2.7739, + "mean_token_accuracy": 0.472153902053833, + "num_tokens": 8098746313.0, + "step": 15843 + }, + { + "epoch": 4.284478096268253, + "grad_norm": 4.125, + "learning_rate": 0.0029502171956546533, + "loss": 2.6001, + "mean_token_accuracy": 0.4332856237888336, + "num_tokens": 8099270516.0, + "step": 15844 + }, + { + "epoch": 4.284748512709573, + "grad_norm": 2.796875, + "learning_rate": 0.002949512270058159, + "loss": 2.7901, + "mean_token_accuracy": 0.44549834728240967, + "num_tokens": 8099755879.0, + "step": 15845 + }, + { + "epoch": 4.285018929150892, + "grad_norm": 3.640625, + "learning_rate": 0.002948807591474306, + "loss": 2.8907, + "mean_token_accuracy": 0.4293522834777832, + "num_tokens": 8100280050.0, + "step": 15846 + }, + { + "epoch": 4.285289345592212, + "grad_norm": 3.71875, + "learning_rate": 0.002948103159924711, + "loss": 3.0247, + "mean_token_accuracy": 0.4236348867416382, + "num_tokens": 8100804224.0, + "step": 15847 + }, + { + "epoch": 4.2855597620335315, + "grad_norm": 3.78125, + "learning_rate": 0.002947398975430991, + "loss": 2.8753, + "mean_token_accuracy": 0.43756502866744995, + "num_tokens": 8101328470.0, + "step": 15848 + }, + { + "epoch": 4.285830178474852, + "grad_norm": 3.734375, + "learning_rate": 0.002946695038014753, + "loss": 2.7468, + "mean_token_accuracy": 0.42744892835617065, + "num_tokens": 8101844387.0, + "step": 15849 + }, + { + "epoch": 4.286100594916171, + "grad_norm": 3.171875, + "learning_rate": 0.002945991347697592, + "loss": 2.7676, + "mean_token_accuracy": 0.44560784101486206, + "num_tokens": 8102368634.0, + "step": 15850 + }, + { + "epoch": 4.286371011357491, + "grad_norm": 53.25, + "learning_rate": 0.0029452879045011038, + "loss": 4.8968, + "mean_token_accuracy": 0.2646622359752655, + "num_tokens": 8102861580.0, + "step": 15851 + }, + { + "epoch": 4.28664142779881, + "grad_norm": 7.34375, + "learning_rate": 0.00294458470844687, + "loss": 2.83, + "mean_token_accuracy": 0.46323031187057495, + "num_tokens": 8103323836.0, + "step": 15852 + }, + { + "epoch": 4.28691184424013, + "grad_norm": 2.546875, + "learning_rate": 0.0029438817595564644, + "loss": 2.7211, + "mean_token_accuracy": 0.4451950192451477, + "num_tokens": 8103848008.0, + "step": 15853 + }, + { + "epoch": 4.287182260681449, + "grad_norm": 3.765625, + "learning_rate": 0.00294317905785146, + "loss": 2.8533, + "mean_token_accuracy": 0.41192328929901123, + "num_tokens": 8104341710.0, + "step": 15854 + }, + { + "epoch": 4.2874526771227695, + "grad_norm": 3.25, + "learning_rate": 0.0029424766033534166, + "loss": 2.8864, + "mean_token_accuracy": 0.41361352801322937, + "num_tokens": 8104865835.0, + "step": 15855 + }, + { + "epoch": 4.287723093564089, + "grad_norm": 4.46875, + "learning_rate": 0.002941774396083884, + "loss": 2.8823, + "mean_token_accuracy": 0.4387882947921753, + "num_tokens": 8105390111.0, + "step": 15856 + }, + { + "epoch": 4.287993510005409, + "grad_norm": 4.09375, + "learning_rate": 0.00294107243606441, + "loss": 2.781, + "mean_token_accuracy": 0.4510575532913208, + "num_tokens": 8105871638.0, + "step": 15857 + }, + { + "epoch": 4.288263926446728, + "grad_norm": 20.375, + "learning_rate": 0.0029403707233165368, + "loss": 2.6163, + "mean_token_accuracy": 0.4553281366825104, + "num_tokens": 8106395773.0, + "step": 15858 + }, + { + "epoch": 4.288534342888047, + "grad_norm": 6.40625, + "learning_rate": 0.0029396692578617886, + "loss": 2.8871, + "mean_token_accuracy": 0.44258570671081543, + "num_tokens": 8106919900.0, + "step": 15859 + }, + { + "epoch": 4.288804759329367, + "grad_norm": 3.609375, + "learning_rate": 0.002938968039721694, + "loss": 2.8234, + "mean_token_accuracy": 0.4194699823856354, + "num_tokens": 8107444156.0, + "step": 15860 + }, + { + "epoch": 4.289075175770687, + "grad_norm": 3.28125, + "learning_rate": 0.0029382670689177648, + "loss": 2.6521, + "mean_token_accuracy": 0.453626811504364, + "num_tokens": 8107968432.0, + "step": 15861 + }, + { + "epoch": 4.2893455922120065, + "grad_norm": 3.78125, + "learning_rate": 0.0029375663454715086, + "loss": 2.8497, + "mean_token_accuracy": 0.44558966159820557, + "num_tokens": 8108492668.0, + "step": 15862 + }, + { + "epoch": 4.289616008653326, + "grad_norm": 3.765625, + "learning_rate": 0.0029368658694044294, + "loss": 2.8637, + "mean_token_accuracy": 0.43593332171440125, + "num_tokens": 8109016913.0, + "step": 15863 + }, + { + "epoch": 4.289886425094646, + "grad_norm": 3.96875, + "learning_rate": 0.0029361656407380166, + "loss": 2.7294, + "mean_token_accuracy": 0.4612410068511963, + "num_tokens": 8109487117.0, + "step": 15864 + }, + { + "epoch": 4.290156841535965, + "grad_norm": 3.703125, + "learning_rate": 0.002935465659493755, + "loss": 2.9183, + "mean_token_accuracy": 0.44000744819641113, + "num_tokens": 8110011396.0, + "step": 15865 + }, + { + "epoch": 4.290427257977285, + "grad_norm": 3.453125, + "learning_rate": 0.0029347659256931252, + "loss": 2.6637, + "mean_token_accuracy": 0.42633241415023804, + "num_tokens": 8110535594.0, + "step": 15866 + }, + { + "epoch": 4.290697674418604, + "grad_norm": 4.03125, + "learning_rate": 0.0029340664393575945, + "loss": 2.8098, + "mean_token_accuracy": 0.438418447971344, + "num_tokens": 8111059662.0, + "step": 15867 + }, + { + "epoch": 4.290968090859924, + "grad_norm": 3.265625, + "learning_rate": 0.002933367200508624, + "loss": 2.8183, + "mean_token_accuracy": 0.43117445707321167, + "num_tokens": 8111571592.0, + "step": 15868 + }, + { + "epoch": 4.291238507301244, + "grad_norm": 3.390625, + "learning_rate": 0.0029326682091676726, + "loss": 2.7292, + "mean_token_accuracy": 0.44986218214035034, + "num_tokens": 8112095856.0, + "step": 15869 + }, + { + "epoch": 4.291508923742564, + "grad_norm": 4.125, + "learning_rate": 0.0029319694653561837, + "loss": 2.7858, + "mean_token_accuracy": 0.4438364803791046, + "num_tokens": 8112620136.0, + "step": 15870 + }, + { + "epoch": 4.291779340183883, + "grad_norm": 42.0, + "learning_rate": 0.0029312709690955986, + "loss": 4.0977, + "mean_token_accuracy": 0.30248236656188965, + "num_tokens": 8113144387.0, + "step": 15871 + }, + { + "epoch": 4.292049756625203, + "grad_norm": 8.5625, + "learning_rate": 0.0029305727204073477, + "loss": 3.0759, + "mean_token_accuracy": 0.42181527614593506, + "num_tokens": 8113613841.0, + "step": 15872 + }, + { + "epoch": 4.292320173066522, + "grad_norm": 2.5625, + "learning_rate": 0.0029298747193128557, + "loss": 2.8378, + "mean_token_accuracy": 0.4471886157989502, + "num_tokens": 8114138093.0, + "step": 15873 + }, + { + "epoch": 4.292590589507842, + "grad_norm": 3.625, + "learning_rate": 0.002929176965833542, + "loss": 2.6935, + "mean_token_accuracy": 0.4462248682975769, + "num_tokens": 8114614772.0, + "step": 15874 + }, + { + "epoch": 4.292861005949161, + "grad_norm": 4.5625, + "learning_rate": 0.0029284794599908144, + "loss": 2.5075, + "mean_token_accuracy": 0.4511381685733795, + "num_tokens": 8115139031.0, + "step": 15875 + }, + { + "epoch": 4.2931314223904815, + "grad_norm": 3.546875, + "learning_rate": 0.0029277822018060717, + "loss": 2.7362, + "mean_token_accuracy": 0.44645220041275024, + "num_tokens": 8115663251.0, + "step": 15876 + }, + { + "epoch": 4.293401838831801, + "grad_norm": 3.21875, + "learning_rate": 0.002927085191300711, + "loss": 2.7767, + "mean_token_accuracy": 0.44195622205734253, + "num_tokens": 8116187398.0, + "step": 15877 + }, + { + "epoch": 4.293672255273121, + "grad_norm": 34.5, + "learning_rate": 0.0029263884284961177, + "loss": 2.431, + "mean_token_accuracy": 0.4821935296058655, + "num_tokens": 8116711635.0, + "step": 15878 + }, + { + "epoch": 4.29394267171444, + "grad_norm": 5.5, + "learning_rate": 0.00292569191341367, + "loss": 2.9289, + "mean_token_accuracy": 0.4238325357437134, + "num_tokens": 8117235887.0, + "step": 15879 + }, + { + "epoch": 4.29421308815576, + "grad_norm": 3.1875, + "learning_rate": 0.0029249956460747394, + "loss": 2.7192, + "mean_token_accuracy": 0.45981496572494507, + "num_tokens": 8117760078.0, + "step": 15880 + }, + { + "epoch": 4.294483504597079, + "grad_norm": 4.15625, + "learning_rate": 0.002924299626500691, + "loss": 2.8434, + "mean_token_accuracy": 0.43293559551239014, + "num_tokens": 8118284180.0, + "step": 15881 + }, + { + "epoch": 4.294753921038399, + "grad_norm": 3.859375, + "learning_rate": 0.002923603854712876, + "loss": 2.7431, + "mean_token_accuracy": 0.4570273756980896, + "num_tokens": 8118797253.0, + "step": 15882 + }, + { + "epoch": 4.295024337479719, + "grad_norm": 4.03125, + "learning_rate": 0.0029229083307326475, + "loss": 2.7672, + "mean_token_accuracy": 0.44958487153053284, + "num_tokens": 8119258285.0, + "step": 15883 + }, + { + "epoch": 4.295294753921039, + "grad_norm": 3.90625, + "learning_rate": 0.0029222130545813454, + "loss": 2.7277, + "mean_token_accuracy": 0.43704745173454285, + "num_tokens": 8119782500.0, + "step": 15884 + }, + { + "epoch": 4.295565170362358, + "grad_norm": 4.0625, + "learning_rate": 0.0029215180262803005, + "loss": 2.9154, + "mean_token_accuracy": 0.4257951080799103, + "num_tokens": 8120266396.0, + "step": 15885 + }, + { + "epoch": 4.295835586803678, + "grad_norm": 4.34375, + "learning_rate": 0.00292082324585084, + "loss": 2.7773, + "mean_token_accuracy": 0.44218772649765015, + "num_tokens": 8120790548.0, + "step": 15886 + }, + { + "epoch": 4.296106003244997, + "grad_norm": 3.671875, + "learning_rate": 0.002920128713314282, + "loss": 2.8513, + "mean_token_accuracy": 0.43513455986976624, + "num_tokens": 8121314719.0, + "step": 15887 + }, + { + "epoch": 4.296376419686317, + "grad_norm": 4.1875, + "learning_rate": 0.0029194344286919355, + "loss": 2.9603, + "mean_token_accuracy": 0.4318634867668152, + "num_tokens": 8121838888.0, + "step": 15888 + }, + { + "epoch": 4.296646836127636, + "grad_norm": 4.8125, + "learning_rate": 0.0029187403920051063, + "loss": 3.0181, + "mean_token_accuracy": 0.43386614322662354, + "num_tokens": 8122341103.0, + "step": 15889 + }, + { + "epoch": 4.2969172525689565, + "grad_norm": 4.1875, + "learning_rate": 0.0029180466032750867, + "loss": 2.706, + "mean_token_accuracy": 0.4494505822658539, + "num_tokens": 8122865191.0, + "step": 15890 + }, + { + "epoch": 4.297187669010276, + "grad_norm": 210.0, + "learning_rate": 0.0029173530625231626, + "loss": 4.8119, + "mean_token_accuracy": 0.27498534321784973, + "num_tokens": 8123389464.0, + "step": 15891 + }, + { + "epoch": 4.297458085451596, + "grad_norm": 7.84375, + "learning_rate": 0.0029166597697706186, + "loss": 2.8593, + "mean_token_accuracy": 0.44147807359695435, + "num_tokens": 8123913648.0, + "step": 15892 + }, + { + "epoch": 4.297728501892915, + "grad_norm": 2.9375, + "learning_rate": 0.002915966725038722, + "loss": 2.6793, + "mean_token_accuracy": 0.4671156406402588, + "num_tokens": 8124437797.0, + "step": 15893 + }, + { + "epoch": 4.297998918334235, + "grad_norm": 3.859375, + "learning_rate": 0.0029152739283487424, + "loss": 2.6147, + "mean_token_accuracy": 0.4617375135421753, + "num_tokens": 8124961881.0, + "step": 15894 + }, + { + "epoch": 4.298269334775554, + "grad_norm": 4.21875, + "learning_rate": 0.0029145813797219343, + "loss": 2.772, + "mean_token_accuracy": 0.421597957611084, + "num_tokens": 8125486160.0, + "step": 15895 + }, + { + "epoch": 4.298539751216874, + "grad_norm": 3.46875, + "learning_rate": 0.002913889079179545, + "loss": 2.6785, + "mean_token_accuracy": 0.4471129775047302, + "num_tokens": 8125970942.0, + "step": 15896 + }, + { + "epoch": 4.298810167658194, + "grad_norm": 3.890625, + "learning_rate": 0.00291319702674282, + "loss": 2.6926, + "mean_token_accuracy": 0.4471662640571594, + "num_tokens": 8126492258.0, + "step": 15897 + }, + { + "epoch": 4.299080584099514, + "grad_norm": 3.328125, + "learning_rate": 0.0029125052224329927, + "loss": 2.7599, + "mean_token_accuracy": 0.44290751218795776, + "num_tokens": 8127016504.0, + "step": 15898 + }, + { + "epoch": 4.299351000540833, + "grad_norm": 4.4375, + "learning_rate": 0.002911813666271287, + "loss": 2.759, + "mean_token_accuracy": 0.43791159987449646, + "num_tokens": 8127540666.0, + "step": 15899 + }, + { + "epoch": 4.299621416982152, + "grad_norm": 3.0, + "learning_rate": 0.002911122358278923, + "loss": 2.7582, + "mean_token_accuracy": 0.45063716173171997, + "num_tokens": 8128064681.0, + "step": 15900 + }, + { + "epoch": 4.299891833423472, + "grad_norm": 4.25, + "learning_rate": 0.002910431298477115, + "loss": 2.7459, + "mean_token_accuracy": 0.4393683671951294, + "num_tokens": 8128588905.0, + "step": 15901 + }, + { + "epoch": 4.300162249864792, + "grad_norm": 3.265625, + "learning_rate": 0.0029097404868870637, + "loss": 2.7584, + "mean_token_accuracy": 0.44057098031044006, + "num_tokens": 8129113186.0, + "step": 15902 + }, + { + "epoch": 4.300432666306111, + "grad_norm": 3.671875, + "learning_rate": 0.0029090499235299673, + "loss": 2.5767, + "mean_token_accuracy": 0.45498621463775635, + "num_tokens": 8129585487.0, + "step": 15903 + }, + { + "epoch": 4.300703082747431, + "grad_norm": 2.46875, + "learning_rate": 0.0029083596084270124, + "loss": 3.0245, + "mean_token_accuracy": 0.4317474365234375, + "num_tokens": 8130079571.0, + "step": 15904 + }, + { + "epoch": 4.300973499188751, + "grad_norm": 4.03125, + "learning_rate": 0.002907669541599381, + "loss": 2.8242, + "mean_token_accuracy": 0.45124930143356323, + "num_tokens": 8130603739.0, + "step": 15905 + }, + { + "epoch": 4.30124391563007, + "grad_norm": 5.34375, + "learning_rate": 0.002906979723068246, + "loss": 2.9055, + "mean_token_accuracy": 0.4369538426399231, + "num_tokens": 8131127947.0, + "step": 15906 + }, + { + "epoch": 4.30151433207139, + "grad_norm": 3.109375, + "learning_rate": 0.002906290152854773, + "loss": 2.7883, + "mean_token_accuracy": 0.44282299280166626, + "num_tokens": 8131652133.0, + "step": 15907 + }, + { + "epoch": 4.301784748512709, + "grad_norm": 4.0625, + "learning_rate": 0.0029056008309801197, + "loss": 2.8661, + "mean_token_accuracy": 0.4453536868095398, + "num_tokens": 8132176409.0, + "step": 15908 + }, + { + "epoch": 4.302055164954029, + "grad_norm": 3.453125, + "learning_rate": 0.002904911757465438, + "loss": 2.7702, + "mean_token_accuracy": 0.45329588651657104, + "num_tokens": 8132643196.0, + "step": 15909 + }, + { + "epoch": 4.3023255813953485, + "grad_norm": 3.3125, + "learning_rate": 0.0029042229323318703, + "loss": 2.805, + "mean_token_accuracy": 0.4497835338115692, + "num_tokens": 8133145514.0, + "step": 15910 + }, + { + "epoch": 4.302595997836669, + "grad_norm": 43.5, + "learning_rate": 0.0029035343556005483, + "loss": 3.7936, + "mean_token_accuracy": 0.3402097225189209, + "num_tokens": 8133669580.0, + "step": 15911 + }, + { + "epoch": 4.302866414277988, + "grad_norm": 7.4375, + "learning_rate": 0.002902846027292605, + "loss": 2.2165, + "mean_token_accuracy": 0.5173957347869873, + "num_tokens": 8134115717.0, + "step": 15912 + }, + { + "epoch": 4.303136830719308, + "grad_norm": 3.953125, + "learning_rate": 0.002902157947429156, + "loss": 2.826, + "mean_token_accuracy": 0.4276871085166931, + "num_tokens": 8134603282.0, + "step": 15913 + }, + { + "epoch": 4.303407247160627, + "grad_norm": 3.265625, + "learning_rate": 0.0029014701160313162, + "loss": 2.5521, + "mean_token_accuracy": 0.4760645031929016, + "num_tokens": 8135118899.0, + "step": 15914 + }, + { + "epoch": 4.303677663601947, + "grad_norm": 3.34375, + "learning_rate": 0.002900782533120189, + "loss": 2.7781, + "mean_token_accuracy": 0.44751447439193726, + "num_tokens": 8135643130.0, + "step": 15915 + }, + { + "epoch": 4.303948080043266, + "grad_norm": 3.421875, + "learning_rate": 0.0029000951987168708, + "loss": 2.6628, + "mean_token_accuracy": 0.45854368805885315, + "num_tokens": 8136132931.0, + "step": 15916 + }, + { + "epoch": 4.304218496484586, + "grad_norm": 3.296875, + "learning_rate": 0.002899408112842455, + "loss": 2.8056, + "mean_token_accuracy": 0.46295928955078125, + "num_tokens": 8136657026.0, + "step": 15917 + }, + { + "epoch": 4.304488912925906, + "grad_norm": 2.90625, + "learning_rate": 0.00289872127551802, + "loss": 2.8044, + "mean_token_accuracy": 0.43977826833724976, + "num_tokens": 8137181255.0, + "step": 15918 + }, + { + "epoch": 4.304759329367226, + "grad_norm": 2.9375, + "learning_rate": 0.0028980346867646385, + "loss": 2.9572, + "mean_token_accuracy": 0.44597041606903076, + "num_tokens": 8137689422.0, + "step": 15919 + }, + { + "epoch": 4.305029745808545, + "grad_norm": 3.625, + "learning_rate": 0.002897348346603381, + "loss": 2.8994, + "mean_token_accuracy": 0.44810017943382263, + "num_tokens": 8138213559.0, + "step": 15920 + }, + { + "epoch": 4.305300162249865, + "grad_norm": 3.703125, + "learning_rate": 0.0028966622550553045, + "loss": 2.8038, + "mean_token_accuracy": 0.43257585167884827, + "num_tokens": 8138737839.0, + "step": 15921 + }, + { + "epoch": 4.305570578691184, + "grad_norm": 3.796875, + "learning_rate": 0.0028959764121414596, + "loss": 2.7862, + "mean_token_accuracy": 0.4486433267593384, + "num_tokens": 8139219127.0, + "step": 15922 + }, + { + "epoch": 4.305840995132504, + "grad_norm": 4.0625, + "learning_rate": 0.0028952908178828916, + "loss": 2.9227, + "mean_token_accuracy": 0.4450684189796448, + "num_tokens": 8139730700.0, + "step": 15923 + }, + { + "epoch": 4.3061114115738235, + "grad_norm": 3.34375, + "learning_rate": 0.0028946054723006367, + "loss": 2.705, + "mean_token_accuracy": 0.4555175304412842, + "num_tokens": 8140243604.0, + "step": 15924 + }, + { + "epoch": 4.306381828015144, + "grad_norm": 3.71875, + "learning_rate": 0.0028939203754157197, + "loss": 2.856, + "mean_token_accuracy": 0.4136234521865845, + "num_tokens": 8140767788.0, + "step": 15925 + }, + { + "epoch": 4.306652244456463, + "grad_norm": 3.8125, + "learning_rate": 0.002893235527249165, + "loss": 2.838, + "mean_token_accuracy": 0.44064444303512573, + "num_tokens": 8141256765.0, + "step": 15926 + }, + { + "epoch": 4.306922660897783, + "grad_norm": 3.640625, + "learning_rate": 0.002892550927821984, + "loss": 2.851, + "mean_token_accuracy": 0.42737942934036255, + "num_tokens": 8141781025.0, + "step": 15927 + }, + { + "epoch": 4.307193077339102, + "grad_norm": 3.78125, + "learning_rate": 0.002891866577155185, + "loss": 2.8297, + "mean_token_accuracy": 0.4431667625904083, + "num_tokens": 8142300308.0, + "step": 15928 + }, + { + "epoch": 4.307463493780422, + "grad_norm": 3.859375, + "learning_rate": 0.0028911824752697614, + "loss": 2.8445, + "mean_token_accuracy": 0.43779921531677246, + "num_tokens": 8142781781.0, + "step": 15929 + }, + { + "epoch": 4.307733910221741, + "grad_norm": 4.84375, + "learning_rate": 0.002890498622186708, + "loss": 2.6523, + "mean_token_accuracy": 0.4332744777202606, + "num_tokens": 8143306022.0, + "step": 15930 + }, + { + "epoch": 4.308004326663061, + "grad_norm": 64.0, + "learning_rate": 0.0028898150179270045, + "loss": 3.7842, + "mean_token_accuracy": 0.38522544503211975, + "num_tokens": 8143830281.0, + "step": 15931 + }, + { + "epoch": 4.308274743104381, + "grad_norm": 5.8125, + "learning_rate": 0.0028891316625116272, + "loss": 2.9548, + "mean_token_accuracy": 0.4419178366661072, + "num_tokens": 8144354441.0, + "step": 15932 + }, + { + "epoch": 4.308545159545701, + "grad_norm": 38.25, + "learning_rate": 0.002888448555961544, + "loss": 3.0095, + "mean_token_accuracy": 0.4501495659351349, + "num_tokens": 8144813684.0, + "step": 15933 + }, + { + "epoch": 4.30881557598702, + "grad_norm": 6.34375, + "learning_rate": 0.0028877656982977114, + "loss": 2.7483, + "mean_token_accuracy": 0.453720360994339, + "num_tokens": 8145314335.0, + "step": 15934 + }, + { + "epoch": 4.30908599242834, + "grad_norm": 3.0625, + "learning_rate": 0.0028870830895410854, + "loss": 2.8388, + "mean_token_accuracy": 0.44370174407958984, + "num_tokens": 8145760241.0, + "step": 15935 + }, + { + "epoch": 4.309356408869659, + "grad_norm": 3.578125, + "learning_rate": 0.0028864007297126084, + "loss": 2.8905, + "mean_token_accuracy": 0.42781299352645874, + "num_tokens": 8146284479.0, + "step": 15936 + }, + { + "epoch": 4.309626825310979, + "grad_norm": 44.75, + "learning_rate": 0.0028857186188332162, + "loss": 2.6501, + "mean_token_accuracy": 0.48010802268981934, + "num_tokens": 8146808757.0, + "step": 15937 + }, + { + "epoch": 4.3098972417522985, + "grad_norm": 5.90625, + "learning_rate": 0.002885036756923841, + "loss": 2.9088, + "mean_token_accuracy": 0.42021501064300537, + "num_tokens": 8147332780.0, + "step": 15938 + }, + { + "epoch": 4.310167658193619, + "grad_norm": 3.265625, + "learning_rate": 0.0028843551440054013, + "loss": 2.8715, + "mean_token_accuracy": 0.4512428045272827, + "num_tokens": 8147797794.0, + "step": 15939 + }, + { + "epoch": 4.310438074634938, + "grad_norm": 4.21875, + "learning_rate": 0.0028836737800988126, + "loss": 2.7943, + "mean_token_accuracy": 0.44749724864959717, + "num_tokens": 8148321888.0, + "step": 15940 + }, + { + "epoch": 4.310708491076257, + "grad_norm": 4.03125, + "learning_rate": 0.0028829926652249823, + "loss": 2.791, + "mean_token_accuracy": 0.4445733428001404, + "num_tokens": 8148846063.0, + "step": 15941 + }, + { + "epoch": 4.310978907517577, + "grad_norm": 3.859375, + "learning_rate": 0.0028823117994048065, + "loss": 2.635, + "mean_token_accuracy": 0.43350279331207275, + "num_tokens": 8149370346.0, + "step": 15942 + }, + { + "epoch": 4.311249323958897, + "grad_norm": 3.078125, + "learning_rate": 0.0028816311826591764, + "loss": 3.0418, + "mean_token_accuracy": 0.4120248258113861, + "num_tokens": 8149894606.0, + "step": 15943 + }, + { + "epoch": 4.311519740400216, + "grad_norm": 3.71875, + "learning_rate": 0.0028809508150089776, + "loss": 2.773, + "mean_token_accuracy": 0.4309733211994171, + "num_tokens": 8150418773.0, + "step": 15944 + }, + { + "epoch": 4.3117901568415355, + "grad_norm": 15.0625, + "learning_rate": 0.0028802706964750823, + "loss": 2.6421, + "mean_token_accuracy": 0.46236926317214966, + "num_tokens": 8150943036.0, + "step": 15945 + }, + { + "epoch": 4.312060573282856, + "grad_norm": 3.75, + "learning_rate": 0.0028795908270783633, + "loss": 2.9808, + "mean_token_accuracy": 0.42744195461273193, + "num_tokens": 8151467278.0, + "step": 15946 + }, + { + "epoch": 4.312330989724175, + "grad_norm": 3.4375, + "learning_rate": 0.002878911206839678, + "loss": 2.8256, + "mean_token_accuracy": 0.44054192304611206, + "num_tokens": 8151981229.0, + "step": 15947 + }, + { + "epoch": 4.312601406165495, + "grad_norm": 3.03125, + "learning_rate": 0.0028782318357798774, + "loss": 2.6907, + "mean_token_accuracy": 0.42211970686912537, + "num_tokens": 8152505503.0, + "step": 15948 + }, + { + "epoch": 4.312871822606814, + "grad_norm": 3.328125, + "learning_rate": 0.002877552713919811, + "loss": 2.8116, + "mean_token_accuracy": 0.4259423017501831, + "num_tokens": 8153029707.0, + "step": 15949 + }, + { + "epoch": 4.313142239048134, + "grad_norm": 3.453125, + "learning_rate": 0.002876873841280313, + "loss": 2.7349, + "mean_token_accuracy": 0.441247820854187, + "num_tokens": 8153533380.0, + "step": 15950 + }, + { + "epoch": 4.313412655489453, + "grad_norm": 49.75, + "learning_rate": 0.002876195217882212, + "loss": 3.7924, + "mean_token_accuracy": 0.2922292649745941, + "num_tokens": 8154057521.0, + "step": 15951 + }, + { + "epoch": 4.3136830719307735, + "grad_norm": 8.0, + "learning_rate": 0.0028755168437463352, + "loss": 2.8852, + "mean_token_accuracy": 0.4340904653072357, + "num_tokens": 8154581772.0, + "step": 15952 + }, + { + "epoch": 4.313953488372093, + "grad_norm": 2.6875, + "learning_rate": 0.0028748387188934944, + "loss": 2.7181, + "mean_token_accuracy": 0.44387683272361755, + "num_tokens": 8155064208.0, + "step": 15953 + }, + { + "epoch": 4.314223904813413, + "grad_norm": 3.859375, + "learning_rate": 0.002874160843344494, + "loss": 2.7782, + "mean_token_accuracy": 0.42658770084381104, + "num_tokens": 8155588385.0, + "step": 15954 + }, + { + "epoch": 4.314494321254732, + "grad_norm": 3.1875, + "learning_rate": 0.0028734832171201366, + "loss": 2.7364, + "mean_token_accuracy": 0.43758895993232727, + "num_tokens": 8156112575.0, + "step": 15955 + }, + { + "epoch": 4.314764737696052, + "grad_norm": 2.875, + "learning_rate": 0.0028728058402412117, + "loss": 2.8136, + "mean_token_accuracy": 0.43448328971862793, + "num_tokens": 8156624145.0, + "step": 15956 + }, + { + "epoch": 4.315035154137371, + "grad_norm": 3.5625, + "learning_rate": 0.002872128712728504, + "loss": 2.9922, + "mean_token_accuracy": 0.4282519221305847, + "num_tokens": 8157094236.0, + "step": 15957 + }, + { + "epoch": 4.315305570578691, + "grad_norm": 3.671875, + "learning_rate": 0.002871451834602792, + "loss": 2.7532, + "mean_token_accuracy": 0.4334729313850403, + "num_tokens": 8157618278.0, + "step": 15958 + }, + { + "epoch": 4.3155759870200106, + "grad_norm": 4.0, + "learning_rate": 0.0028707752058848407, + "loss": 2.9484, + "mean_token_accuracy": 0.42922237515449524, + "num_tokens": 8158142471.0, + "step": 15959 + }, + { + "epoch": 4.315846403461331, + "grad_norm": 4.15625, + "learning_rate": 0.0028700988265954125, + "loss": 2.8482, + "mean_token_accuracy": 0.42171019315719604, + "num_tokens": 8158666752.0, + "step": 15960 + }, + { + "epoch": 4.31611681990265, + "grad_norm": 4.5625, + "learning_rate": 0.0028694226967552615, + "loss": 2.6307, + "mean_token_accuracy": 0.4120349884033203, + "num_tokens": 8159139763.0, + "step": 15961 + }, + { + "epoch": 4.31638723634397, + "grad_norm": 3.859375, + "learning_rate": 0.0028687468163851315, + "loss": 2.6321, + "mean_token_accuracy": 0.4605919122695923, + "num_tokens": 8159611286.0, + "step": 15962 + }, + { + "epoch": 4.316657652785289, + "grad_norm": 3.8125, + "learning_rate": 0.0028680711855057633, + "loss": 2.8683, + "mean_token_accuracy": 0.43497616052627563, + "num_tokens": 8160135434.0, + "step": 15963 + }, + { + "epoch": 4.316928069226609, + "grad_norm": 4.75, + "learning_rate": 0.0028673958041378856, + "loss": 2.8177, + "mean_token_accuracy": 0.4369608461856842, + "num_tokens": 8160659447.0, + "step": 15964 + }, + { + "epoch": 4.317198485667928, + "grad_norm": 3.625, + "learning_rate": 0.002866720672302219, + "loss": 2.7494, + "mean_token_accuracy": 0.4478672742843628, + "num_tokens": 8161183623.0, + "step": 15965 + }, + { + "epoch": 4.3174689021092485, + "grad_norm": 4.59375, + "learning_rate": 0.0028660457900194823, + "loss": 2.9141, + "mean_token_accuracy": 0.4365342855453491, + "num_tokens": 8161707832.0, + "step": 15966 + }, + { + "epoch": 4.317739318550568, + "grad_norm": 3.21875, + "learning_rate": 0.0028653711573103815, + "loss": 2.9932, + "mean_token_accuracy": 0.43562787771224976, + "num_tokens": 8162168330.0, + "step": 15967 + }, + { + "epoch": 4.318009734991888, + "grad_norm": 3.21875, + "learning_rate": 0.002864696774195614, + "loss": 2.932, + "mean_token_accuracy": 0.4004501700401306, + "num_tokens": 8162692521.0, + "step": 15968 + }, + { + "epoch": 4.318280151433207, + "grad_norm": 3.296875, + "learning_rate": 0.002864022640695875, + "loss": 2.7137, + "mean_token_accuracy": 0.44351375102996826, + "num_tokens": 8163216681.0, + "step": 15969 + }, + { + "epoch": 4.318550567874527, + "grad_norm": 7.90625, + "learning_rate": 0.002863348756831847, + "loss": 2.8338, + "mean_token_accuracy": 0.4362756907939911, + "num_tokens": 8163739974.0, + "step": 15970 + }, + { + "epoch": 4.318820984315846, + "grad_norm": 34.75, + "learning_rate": 0.0028626751226242093, + "loss": 2.7189, + "mean_token_accuracy": 0.44772258400917053, + "num_tokens": 8164264149.0, + "step": 15971 + }, + { + "epoch": 4.319091400757166, + "grad_norm": 4.0, + "learning_rate": 0.002862001738093627, + "loss": 2.7661, + "mean_token_accuracy": 0.4096611738204956, + "num_tokens": 8164788348.0, + "step": 15972 + }, + { + "epoch": 4.3193618171984856, + "grad_norm": 2.828125, + "learning_rate": 0.0028613286032607653, + "loss": 2.7963, + "mean_token_accuracy": 0.4315997064113617, + "num_tokens": 8165312625.0, + "step": 15973 + }, + { + "epoch": 4.319632233639806, + "grad_norm": 3.734375, + "learning_rate": 0.0028606557181462744, + "loss": 2.8478, + "mean_token_accuracy": 0.4473836123943329, + "num_tokens": 8165778757.0, + "step": 15974 + }, + { + "epoch": 4.319902650081125, + "grad_norm": 4.03125, + "learning_rate": 0.002859983082770804, + "loss": 2.7689, + "mean_token_accuracy": 0.47625041007995605, + "num_tokens": 8166239059.0, + "step": 15975 + }, + { + "epoch": 4.320173066522445, + "grad_norm": 3.625, + "learning_rate": 0.0028593106971549907, + "loss": 2.6386, + "mean_token_accuracy": 0.45174795389175415, + "num_tokens": 8166763178.0, + "step": 15976 + }, + { + "epoch": 4.320443482963764, + "grad_norm": 3.59375, + "learning_rate": 0.0028586385613194655, + "loss": 2.8466, + "mean_token_accuracy": 0.440990686416626, + "num_tokens": 8167287410.0, + "step": 15977 + }, + { + "epoch": 4.320713899405084, + "grad_norm": 4.25, + "learning_rate": 0.002857966675284851, + "loss": 2.8595, + "mean_token_accuracy": 0.44504043459892273, + "num_tokens": 8167811647.0, + "step": 15978 + }, + { + "epoch": 4.320984315846403, + "grad_norm": 4.40625, + "learning_rate": 0.0028572950390717638, + "loss": 2.6402, + "mean_token_accuracy": 0.43934011459350586, + "num_tokens": 8168313292.0, + "step": 15979 + }, + { + "epoch": 4.3212547322877235, + "grad_norm": 3.375, + "learning_rate": 0.002856623652700809, + "loss": 2.8512, + "mean_token_accuracy": 0.43341419100761414, + "num_tokens": 8168829618.0, + "step": 15980 + }, + { + "epoch": 4.321525148729043, + "grad_norm": 3.28125, + "learning_rate": 0.0028559525161925914, + "loss": 2.8315, + "mean_token_accuracy": 0.43771207332611084, + "num_tokens": 8169353701.0, + "step": 15981 + }, + { + "epoch": 4.321795565170362, + "grad_norm": 4.28125, + "learning_rate": 0.0028552816295677, + "loss": 2.9341, + "mean_token_accuracy": 0.41383451223373413, + "num_tokens": 8169877859.0, + "step": 15982 + }, + { + "epoch": 4.322065981611682, + "grad_norm": 3.796875, + "learning_rate": 0.0028546109928467185, + "loss": 2.8842, + "mean_token_accuracy": 0.43187958002090454, + "num_tokens": 8170398017.0, + "step": 15983 + }, + { + "epoch": 4.322336398053002, + "grad_norm": 52.5, + "learning_rate": 0.002853940606050227, + "loss": 3.3081, + "mean_token_accuracy": 0.3937799036502838, + "num_tokens": 8170910994.0, + "step": 15984 + }, + { + "epoch": 4.322606814494321, + "grad_norm": 5.9375, + "learning_rate": 0.0028532704691987924, + "loss": 2.7064, + "mean_token_accuracy": 0.44976773858070374, + "num_tokens": 8171415460.0, + "step": 15985 + }, + { + "epoch": 4.3228772309356405, + "grad_norm": 2.5, + "learning_rate": 0.002852600582312978, + "loss": 2.8637, + "mean_token_accuracy": 0.4323346018791199, + "num_tokens": 8171939734.0, + "step": 15986 + }, + { + "epoch": 4.323147647376961, + "grad_norm": 5.25, + "learning_rate": 0.0028519309454133395, + "loss": 2.8982, + "mean_token_accuracy": 0.4310844838619232, + "num_tokens": 8172463876.0, + "step": 15987 + }, + { + "epoch": 4.32341806381828, + "grad_norm": 4.3125, + "learning_rate": 0.0028512615585204195, + "loss": 2.7042, + "mean_token_accuracy": 0.4457739591598511, + "num_tokens": 8172956690.0, + "step": 15988 + }, + { + "epoch": 4.3236884802596, + "grad_norm": 4.28125, + "learning_rate": 0.002850592421654761, + "loss": 2.7634, + "mean_token_accuracy": 0.48336896300315857, + "num_tokens": 8173480946.0, + "step": 15989 + }, + { + "epoch": 4.323958896700919, + "grad_norm": 3.21875, + "learning_rate": 0.0028499235348368917, + "loss": 2.7222, + "mean_token_accuracy": 0.4259130358695984, + "num_tokens": 8174005040.0, + "step": 15990 + }, + { + "epoch": 4.324229313142239, + "grad_norm": 69.5, + "learning_rate": 0.0028492548980873353, + "loss": 4.8345, + "mean_token_accuracy": 0.2478857934474945, + "num_tokens": 8174468208.0, + "step": 15991 + }, + { + "epoch": 4.324499729583558, + "grad_norm": 6.46875, + "learning_rate": 0.0028485865114266096, + "loss": 2.8767, + "mean_token_accuracy": 0.45316940546035767, + "num_tokens": 8174988086.0, + "step": 15992 + }, + { + "epoch": 4.324770146024878, + "grad_norm": 3.59375, + "learning_rate": 0.002847918374875222, + "loss": 2.9612, + "mean_token_accuracy": 0.42682069540023804, + "num_tokens": 8175512361.0, + "step": 15993 + }, + { + "epoch": 4.325040562466198, + "grad_norm": 4.09375, + "learning_rate": 0.002847250488453671, + "loss": 2.7576, + "mean_token_accuracy": 0.44366616010665894, + "num_tokens": 8176013263.0, + "step": 15994 + }, + { + "epoch": 4.325310978907518, + "grad_norm": 3.5625, + "learning_rate": 0.002846582852182451, + "loss": 2.7655, + "mean_token_accuracy": 0.4355161488056183, + "num_tokens": 8176537473.0, + "step": 15995 + }, + { + "epoch": 4.325581395348837, + "grad_norm": 4.1875, + "learning_rate": 0.0028459154660820476, + "loss": 2.9583, + "mean_token_accuracy": 0.40554478764533997, + "num_tokens": 8177061573.0, + "step": 15996 + }, + { + "epoch": 4.325851811790157, + "grad_norm": 3.8125, + "learning_rate": 0.0028452483301729353, + "loss": 2.7361, + "mean_token_accuracy": 0.41922223567962646, + "num_tokens": 8177585758.0, + "step": 15997 + }, + { + "epoch": 4.326122228231476, + "grad_norm": 3.875, + "learning_rate": 0.002844581444475588, + "loss": 2.8489, + "mean_token_accuracy": 0.4111001193523407, + "num_tokens": 8178109802.0, + "step": 15998 + }, + { + "epoch": 4.326392644672796, + "grad_norm": 3.3125, + "learning_rate": 0.002843914809010464, + "loss": 2.9011, + "mean_token_accuracy": 0.43916603922843933, + "num_tokens": 8178576922.0, + "step": 15999 + }, + { + "epoch": 4.3266630611141155, + "grad_norm": 3.359375, + "learning_rate": 0.0028432484237980184, + "loss": 2.907, + "mean_token_accuracy": 0.42920827865600586, + "num_tokens": 8179040938.0, + "step": 16000 + }, + { + "epoch": 4.326933477555436, + "grad_norm": 3.546875, + "learning_rate": 0.002842582288858701, + "loss": 2.8981, + "mean_token_accuracy": 0.4578247666358948, + "num_tokens": 8179502150.0, + "step": 16001 + }, + { + "epoch": 4.327203893996755, + "grad_norm": 3.53125, + "learning_rate": 0.002841916404212948, + "loss": 2.8895, + "mean_token_accuracy": 0.42851853370666504, + "num_tokens": 8180026332.0, + "step": 16002 + }, + { + "epoch": 4.327474310438075, + "grad_norm": 3.640625, + "learning_rate": 0.0028412507698811883, + "loss": 2.6566, + "mean_token_accuracy": 0.45592451095581055, + "num_tokens": 8180550428.0, + "step": 16003 + }, + { + "epoch": 4.327744726879394, + "grad_norm": 3.703125, + "learning_rate": 0.0028405853858838514, + "loss": 2.8279, + "mean_token_accuracy": 0.45243555307388306, + "num_tokens": 8181050641.0, + "step": 16004 + }, + { + "epoch": 4.328015143320714, + "grad_norm": 3.25, + "learning_rate": 0.00283992025224135, + "loss": 2.8772, + "mean_token_accuracy": 0.41981837153434753, + "num_tokens": 8181574818.0, + "step": 16005 + }, + { + "epoch": 4.328285559762033, + "grad_norm": 3.71875, + "learning_rate": 0.0028392553689740906, + "loss": 2.8906, + "mean_token_accuracy": 0.4511275887489319, + "num_tokens": 8182098965.0, + "step": 16006 + }, + { + "epoch": 4.328555976203353, + "grad_norm": 3.984375, + "learning_rate": 0.002838590736102478, + "loss": 2.8034, + "mean_token_accuracy": 0.4364825487136841, + "num_tokens": 8182623089.0, + "step": 16007 + }, + { + "epoch": 4.328826392644673, + "grad_norm": 4.15625, + "learning_rate": 0.0028379263536468997, + "loss": 3.0026, + "mean_token_accuracy": 0.4112866520881653, + "num_tokens": 8183147341.0, + "step": 16008 + }, + { + "epoch": 4.329096809085993, + "grad_norm": 4.0625, + "learning_rate": 0.0028372622216277467, + "loss": 2.8928, + "mean_token_accuracy": 0.4386364817619324, + "num_tokens": 8183609590.0, + "step": 16009 + }, + { + "epoch": 4.329367225527312, + "grad_norm": 3.40625, + "learning_rate": 0.002836598340065394, + "loss": 2.9276, + "mean_token_accuracy": 0.41731297969818115, + "num_tokens": 8184133859.0, + "step": 16010 + }, + { + "epoch": 4.329637641968632, + "grad_norm": 57.75, + "learning_rate": 0.002835934708980209, + "loss": 4.6992, + "mean_token_accuracy": 0.2752346396446228, + "num_tokens": 8184658106.0, + "step": 16011 + }, + { + "epoch": 4.329908058409951, + "grad_norm": 7.75, + "learning_rate": 0.0028352713283925574, + "loss": 2.8051, + "mean_token_accuracy": 0.43484941124916077, + "num_tokens": 8185156621.0, + "step": 16012 + }, + { + "epoch": 4.330178474851271, + "grad_norm": 2.515625, + "learning_rate": 0.002834608198322791, + "loss": 2.7371, + "mean_token_accuracy": 0.41708284616470337, + "num_tokens": 8185680862.0, + "step": 16013 + }, + { + "epoch": 4.3304488912925905, + "grad_norm": 3.75, + "learning_rate": 0.0028339453187912595, + "loss": 2.7734, + "mean_token_accuracy": 0.451386034488678, + "num_tokens": 8186204937.0, + "step": 16014 + }, + { + "epoch": 4.330719307733911, + "grad_norm": 3.421875, + "learning_rate": 0.0028332826898182983, + "loss": 2.6961, + "mean_token_accuracy": 0.4413781762123108, + "num_tokens": 8186729116.0, + "step": 16015 + }, + { + "epoch": 4.33098972417523, + "grad_norm": 3.25, + "learning_rate": 0.0028326203114242437, + "loss": 2.8325, + "mean_token_accuracy": 0.44480153918266296, + "num_tokens": 8187200694.0, + "step": 16016 + }, + { + "epoch": 4.33126014061655, + "grad_norm": 4.0625, + "learning_rate": 0.002831958183629415, + "loss": 2.6799, + "mean_token_accuracy": 0.44240862131118774, + "num_tokens": 8187630164.0, + "step": 16017 + }, + { + "epoch": 4.331530557057869, + "grad_norm": 3.234375, + "learning_rate": 0.0028312963064541304, + "loss": 2.6762, + "mean_token_accuracy": 0.46332716941833496, + "num_tokens": 8188129969.0, + "step": 16018 + }, + { + "epoch": 4.331800973499189, + "grad_norm": 3.5, + "learning_rate": 0.002830634679918699, + "loss": 2.71, + "mean_token_accuracy": 0.4404720067977905, + "num_tokens": 8188651882.0, + "step": 16019 + }, + { + "epoch": 4.332071389940508, + "grad_norm": 4.3125, + "learning_rate": 0.002829973304043418, + "loss": 2.5797, + "mean_token_accuracy": 0.4382252097129822, + "num_tokens": 8189175969.0, + "step": 16020 + }, + { + "epoch": 4.332341806381828, + "grad_norm": 3.921875, + "learning_rate": 0.002829312178848583, + "loss": 2.5807, + "mean_token_accuracy": 0.4410463273525238, + "num_tokens": 8189700178.0, + "step": 16021 + }, + { + "epoch": 4.332612222823148, + "grad_norm": 3.8125, + "learning_rate": 0.0028286513043544797, + "loss": 2.7227, + "mean_token_accuracy": 0.4361899495124817, + "num_tokens": 8190224414.0, + "step": 16022 + }, + { + "epoch": 4.332882639264467, + "grad_norm": 3.328125, + "learning_rate": 0.002827990680581383, + "loss": 2.9108, + "mean_token_accuracy": 0.42797064781188965, + "num_tokens": 8190748690.0, + "step": 16023 + }, + { + "epoch": 4.333153055705787, + "grad_norm": 3.578125, + "learning_rate": 0.0028273303075495673, + "loss": 2.695, + "mean_token_accuracy": 0.46015942096710205, + "num_tokens": 8191272970.0, + "step": 16024 + }, + { + "epoch": 4.333423472147107, + "grad_norm": 3.859375, + "learning_rate": 0.002826670185279291, + "loss": 2.7781, + "mean_token_accuracy": 0.4191434383392334, + "num_tokens": 8191797088.0, + "step": 16025 + }, + { + "epoch": 4.333693888588426, + "grad_norm": 3.296875, + "learning_rate": 0.002826010313790807, + "loss": 2.9417, + "mean_token_accuracy": 0.43183159828186035, + "num_tokens": 8192321298.0, + "step": 16026 + }, + { + "epoch": 4.333964305029745, + "grad_norm": 3.84375, + "learning_rate": 0.0028253506931043663, + "loss": 2.766, + "mean_token_accuracy": 0.45040950179100037, + "num_tokens": 8192833740.0, + "step": 16027 + }, + { + "epoch": 4.3342347214710655, + "grad_norm": 3.8125, + "learning_rate": 0.002824691323240208, + "loss": 2.7777, + "mean_token_accuracy": 0.44638383388519287, + "num_tokens": 8193357919.0, + "step": 16028 + }, + { + "epoch": 4.334505137912385, + "grad_norm": 4.4375, + "learning_rate": 0.0028240322042185594, + "loss": 2.8788, + "mean_token_accuracy": 0.4399239420890808, + "num_tokens": 8193882192.0, + "step": 16029 + }, + { + "epoch": 4.334775554353705, + "grad_norm": 4.21875, + "learning_rate": 0.002823373336059649, + "loss": 2.7137, + "mean_token_accuracy": 0.4259001612663269, + "num_tokens": 8194406380.0, + "step": 16030 + }, + { + "epoch": 4.335045970795024, + "grad_norm": 107.0, + "learning_rate": 0.0028227147187836884, + "loss": 4.6275, + "mean_token_accuracy": 0.28550535440444946, + "num_tokens": 8194920650.0, + "step": 16031 + }, + { + "epoch": 4.335316387236344, + "grad_norm": 6.0625, + "learning_rate": 0.00282205635241089, + "loss": 2.9702, + "mean_token_accuracy": 0.4367392063140869, + "num_tokens": 8195444925.0, + "step": 16032 + }, + { + "epoch": 4.335586803677663, + "grad_norm": 2.703125, + "learning_rate": 0.0028213982369614528, + "loss": 2.8694, + "mean_token_accuracy": 0.43778693675994873, + "num_tokens": 8195969166.0, + "step": 16033 + }, + { + "epoch": 4.335857220118983, + "grad_norm": 3.4375, + "learning_rate": 0.002820740372455567, + "loss": 2.6515, + "mean_token_accuracy": 0.479317307472229, + "num_tokens": 8196428909.0, + "step": 16034 + }, + { + "epoch": 4.3361276365603025, + "grad_norm": 3.953125, + "learning_rate": 0.002820082758913423, + "loss": 2.6455, + "mean_token_accuracy": 0.44554683566093445, + "num_tokens": 8196953172.0, + "step": 16035 + }, + { + "epoch": 4.336398053001623, + "grad_norm": 3.328125, + "learning_rate": 0.0028194253963551943, + "loss": 2.8243, + "mean_token_accuracy": 0.41336148977279663, + "num_tokens": 8197477435.0, + "step": 16036 + }, + { + "epoch": 4.336668469442942, + "grad_norm": 3.71875, + "learning_rate": 0.0028187682848010505, + "loss": 2.9075, + "mean_token_accuracy": 0.448752224445343, + "num_tokens": 8197967512.0, + "step": 16037 + }, + { + "epoch": 4.336938885884262, + "grad_norm": 4.0625, + "learning_rate": 0.002818111424271157, + "loss": 2.9541, + "mean_token_accuracy": 0.4233528971672058, + "num_tokens": 8198491747.0, + "step": 16038 + }, + { + "epoch": 4.337209302325581, + "grad_norm": 3.640625, + "learning_rate": 0.002817454814785666, + "loss": 2.7903, + "mean_token_accuracy": 0.4521753191947937, + "num_tokens": 8198973304.0, + "step": 16039 + }, + { + "epoch": 4.337479718766901, + "grad_norm": 3.453125, + "learning_rate": 0.0028167984563647244, + "loss": 2.8043, + "mean_token_accuracy": 0.4516720175743103, + "num_tokens": 8199497501.0, + "step": 16040 + }, + { + "epoch": 4.33775013520822, + "grad_norm": 4.21875, + "learning_rate": 0.002816142349028471, + "loss": 2.7036, + "mean_token_accuracy": 0.45172932744026184, + "num_tokens": 8200021773.0, + "step": 16041 + }, + { + "epoch": 4.3380205516495405, + "grad_norm": 2.828125, + "learning_rate": 0.002815486492797037, + "loss": 2.7712, + "mean_token_accuracy": 0.4504010081291199, + "num_tokens": 8200546050.0, + "step": 16042 + }, + { + "epoch": 4.33829096809086, + "grad_norm": 4.5625, + "learning_rate": 0.0028148308876905464, + "loss": 2.8278, + "mean_token_accuracy": 0.45219293236732483, + "num_tokens": 8201070137.0, + "step": 16043 + }, + { + "epoch": 4.33856138453218, + "grad_norm": 3.53125, + "learning_rate": 0.0028141755337291177, + "loss": 2.6382, + "mean_token_accuracy": 0.4597115218639374, + "num_tokens": 8201547205.0, + "step": 16044 + }, + { + "epoch": 4.338831800973499, + "grad_norm": 3.453125, + "learning_rate": 0.0028135204309328544, + "loss": 2.7655, + "mean_token_accuracy": 0.45850592851638794, + "num_tokens": 8202071475.0, + "step": 16045 + }, + { + "epoch": 4.339102217414819, + "grad_norm": 4.40625, + "learning_rate": 0.0028128655793218595, + "loss": 2.7967, + "mean_token_accuracy": 0.4439266324043274, + "num_tokens": 8202595742.0, + "step": 16046 + }, + { + "epoch": 4.339372633856138, + "grad_norm": 4.0625, + "learning_rate": 0.0028122109789162253, + "loss": 2.9022, + "mean_token_accuracy": 0.4444148540496826, + "num_tokens": 8203119851.0, + "step": 16047 + }, + { + "epoch": 4.339643050297458, + "grad_norm": 4.03125, + "learning_rate": 0.0028115566297360383, + "loss": 2.6846, + "mean_token_accuracy": 0.44412487745285034, + "num_tokens": 8203633765.0, + "step": 16048 + }, + { + "epoch": 4.3399134667387775, + "grad_norm": 4.34375, + "learning_rate": 0.0028109025318013726, + "loss": 2.7205, + "mean_token_accuracy": 0.43776220083236694, + "num_tokens": 8204157981.0, + "step": 16049 + }, + { + "epoch": 4.340183883180098, + "grad_norm": 3.96875, + "learning_rate": 0.0028102486851323016, + "loss": 2.7084, + "mean_token_accuracy": 0.44721323251724243, + "num_tokens": 8204682256.0, + "step": 16050 + }, + { + "epoch": 4.340454299621417, + "grad_norm": 94.5, + "learning_rate": 0.002809595089748885, + "loss": 5.563, + "mean_token_accuracy": 0.23827789723873138, + "num_tokens": 8205150727.0, + "step": 16051 + }, + { + "epoch": 4.340724716062737, + "grad_norm": 7.84375, + "learning_rate": 0.0028089417456711764, + "loss": 2.8832, + "mean_token_accuracy": 0.4154118001461029, + "num_tokens": 8205674996.0, + "step": 16052 + }, + { + "epoch": 4.340995132504056, + "grad_norm": 6.5625, + "learning_rate": 0.0028082886529192247, + "loss": 2.6554, + "mean_token_accuracy": 0.48037028312683105, + "num_tokens": 8206199140.0, + "step": 16053 + }, + { + "epoch": 4.341265548945376, + "grad_norm": 3.640625, + "learning_rate": 0.0028076358115130652, + "loss": 2.9994, + "mean_token_accuracy": 0.4224814474582672, + "num_tokens": 8206712142.0, + "step": 16054 + }, + { + "epoch": 4.341535965386695, + "grad_norm": 3.28125, + "learning_rate": 0.0028069832214727337, + "loss": 2.6514, + "mean_token_accuracy": 0.45201563835144043, + "num_tokens": 8207236374.0, + "step": 16055 + }, + { + "epoch": 4.3418063818280155, + "grad_norm": 3.40625, + "learning_rate": 0.002806330882818249, + "loss": 2.6511, + "mean_token_accuracy": 0.4282528758049011, + "num_tokens": 8207755951.0, + "step": 16056 + }, + { + "epoch": 4.342076798269335, + "grad_norm": 3.25, + "learning_rate": 0.0028056787955696295, + "loss": 2.7593, + "mean_token_accuracy": 0.4424295723438263, + "num_tokens": 8208280199.0, + "step": 16057 + }, + { + "epoch": 4.342347214710655, + "grad_norm": 5.78125, + "learning_rate": 0.002805026959746883, + "loss": 2.5907, + "mean_token_accuracy": 0.47977787256240845, + "num_tokens": 8208804365.0, + "step": 16058 + }, + { + "epoch": 4.342617631151974, + "grad_norm": 3.28125, + "learning_rate": 0.002804375375370009, + "loss": 2.8666, + "mean_token_accuracy": 0.454446017742157, + "num_tokens": 8209328622.0, + "step": 16059 + }, + { + "epoch": 4.342888047593294, + "grad_norm": 3.34375, + "learning_rate": 0.0028037240424590004, + "loss": 2.8161, + "mean_token_accuracy": 0.44621676206588745, + "num_tokens": 8209804301.0, + "step": 16060 + }, + { + "epoch": 4.343158464034613, + "grad_norm": 3.4375, + "learning_rate": 0.0028030729610338417, + "loss": 2.6578, + "mean_token_accuracy": 0.4477497935295105, + "num_tokens": 8210328468.0, + "step": 16061 + }, + { + "epoch": 4.343428880475933, + "grad_norm": 3.375, + "learning_rate": 0.002802422131114511, + "loss": 2.8853, + "mean_token_accuracy": 0.4420965313911438, + "num_tokens": 8210852648.0, + "step": 16062 + }, + { + "epoch": 4.3436992969172525, + "grad_norm": 3.296875, + "learning_rate": 0.0028017715527209747, + "loss": 2.9711, + "mean_token_accuracy": 0.43073368072509766, + "num_tokens": 8211376831.0, + "step": 16063 + }, + { + "epoch": 4.343969713358572, + "grad_norm": 19.5, + "learning_rate": 0.002801121225873199, + "loss": 2.5445, + "mean_token_accuracy": 0.48213809728622437, + "num_tokens": 8211900964.0, + "step": 16064 + }, + { + "epoch": 4.344240129799892, + "grad_norm": 5.03125, + "learning_rate": 0.002800471150591135, + "loss": 2.7033, + "mean_token_accuracy": 0.4441341161727905, + "num_tokens": 8212373425.0, + "step": 16065 + }, + { + "epoch": 4.344510546241212, + "grad_norm": 2.359375, + "learning_rate": 0.002799821326894727, + "loss": 2.6998, + "mean_token_accuracy": 0.45352691411972046, + "num_tokens": 8212866695.0, + "step": 16066 + }, + { + "epoch": 4.344780962682531, + "grad_norm": 3.796875, + "learning_rate": 0.002799171754803919, + "loss": 2.8736, + "mean_token_accuracy": 0.41864100098609924, + "num_tokens": 8213390971.0, + "step": 16067 + }, + { + "epoch": 4.34505137912385, + "grad_norm": 4.0625, + "learning_rate": 0.0027985224343386374, + "loss": 2.9454, + "mean_token_accuracy": 0.4059905409812927, + "num_tokens": 8213878357.0, + "step": 16068 + }, + { + "epoch": 4.34532179556517, + "grad_norm": 3.109375, + "learning_rate": 0.002797873365518805, + "loss": 2.752, + "mean_token_accuracy": 0.44328027963638306, + "num_tokens": 8214363936.0, + "step": 16069 + }, + { + "epoch": 4.34559221200649, + "grad_norm": 3.609375, + "learning_rate": 0.0027972245483643392, + "loss": 2.6879, + "mean_token_accuracy": 0.4268566966056824, + "num_tokens": 8214888187.0, + "step": 16070 + }, + { + "epoch": 4.34586262844781, + "grad_norm": 33.75, + "learning_rate": 0.002796575982895148, + "loss": 4.0331, + "mean_token_accuracy": 0.3569950461387634, + "num_tokens": 8215409777.0, + "step": 16071 + }, + { + "epoch": 4.346133044889129, + "grad_norm": 6.3125, + "learning_rate": 0.0027959276691311295, + "loss": 2.841, + "mean_token_accuracy": 0.42741936445236206, + "num_tokens": 8215933960.0, + "step": 16072 + }, + { + "epoch": 4.346403461330449, + "grad_norm": 2.640625, + "learning_rate": 0.002795279607092178, + "loss": 2.7561, + "mean_token_accuracy": 0.4355362355709076, + "num_tokens": 8216458179.0, + "step": 16073 + }, + { + "epoch": 4.346673877771768, + "grad_norm": 3.25, + "learning_rate": 0.0027946317967981766, + "loss": 2.7193, + "mean_token_accuracy": 0.4697591960430145, + "num_tokens": 8216934492.0, + "step": 16074 + }, + { + "epoch": 4.346944294213088, + "grad_norm": 3.453125, + "learning_rate": 0.0027939842382690007, + "loss": 2.7611, + "mean_token_accuracy": 0.43468424677848816, + "num_tokens": 8217458726.0, + "step": 16075 + }, + { + "epoch": 4.3472147106544075, + "grad_norm": 2.875, + "learning_rate": 0.002793336931524522, + "loss": 2.731, + "mean_token_accuracy": 0.4324820637702942, + "num_tokens": 8217982921.0, + "step": 16076 + }, + { + "epoch": 4.3474851270957275, + "grad_norm": 4.25, + "learning_rate": 0.002792689876584599, + "loss": 2.684, + "mean_token_accuracy": 0.4680718779563904, + "num_tokens": 8218506980.0, + "step": 16077 + }, + { + "epoch": 4.347755543537047, + "grad_norm": 3.359375, + "learning_rate": 0.0027920430734690873, + "loss": 2.9002, + "mean_token_accuracy": 0.4356497526168823, + "num_tokens": 8219031135.0, + "step": 16078 + }, + { + "epoch": 4.348025959978367, + "grad_norm": 3.59375, + "learning_rate": 0.002791396522197834, + "loss": 2.8895, + "mean_token_accuracy": 0.44232067465782166, + "num_tokens": 8219516118.0, + "step": 16079 + }, + { + "epoch": 4.348296376419686, + "grad_norm": 3.609375, + "learning_rate": 0.002790750222790673, + "loss": 2.7456, + "mean_token_accuracy": 0.4440170228481293, + "num_tokens": 8220003196.0, + "step": 16080 + }, + { + "epoch": 4.348566792861006, + "grad_norm": 3.296875, + "learning_rate": 0.002790104175267437, + "loss": 2.7915, + "mean_token_accuracy": 0.4414137899875641, + "num_tokens": 8220483085.0, + "step": 16081 + }, + { + "epoch": 4.348837209302325, + "grad_norm": 3.6875, + "learning_rate": 0.0027894583796479505, + "loss": 2.9507, + "mean_token_accuracy": 0.42702317237854004, + "num_tokens": 8221007190.0, + "step": 16082 + }, + { + "epoch": 4.349107625743645, + "grad_norm": 3.65625, + "learning_rate": 0.002788812835952024, + "loss": 2.6546, + "mean_token_accuracy": 0.4390760660171509, + "num_tokens": 8221531349.0, + "step": 16083 + }, + { + "epoch": 4.349378042184965, + "grad_norm": 4.5625, + "learning_rate": 0.0027881675441994696, + "loss": 2.6135, + "mean_token_accuracy": 0.4324800372123718, + "num_tokens": 8222055626.0, + "step": 16084 + }, + { + "epoch": 4.349648458626285, + "grad_norm": 6.625, + "learning_rate": 0.002787522504410083, + "loss": 2.3043, + "mean_token_accuracy": 0.5010716915130615, + "num_tokens": 8222551915.0, + "step": 16085 + }, + { + "epoch": 4.349918875067604, + "grad_norm": 2.78125, + "learning_rate": 0.0027868777166036563, + "loss": 2.8337, + "mean_token_accuracy": 0.4332233667373657, + "num_tokens": 8223076004.0, + "step": 16086 + }, + { + "epoch": 4.350189291508924, + "grad_norm": 3.578125, + "learning_rate": 0.0027862331807999765, + "loss": 2.8232, + "mean_token_accuracy": 0.45687127113342285, + "num_tokens": 8223554142.0, + "step": 16087 + }, + { + "epoch": 4.350459707950243, + "grad_norm": 3.546875, + "learning_rate": 0.0027855888970188185, + "loss": 2.6048, + "mean_token_accuracy": 0.4545014202594757, + "num_tokens": 8224078282.0, + "step": 16088 + }, + { + "epoch": 4.350730124391563, + "grad_norm": 3.59375, + "learning_rate": 0.0027849448652799477, + "loss": 2.9095, + "mean_token_accuracy": 0.4387246370315552, + "num_tokens": 8224567049.0, + "step": 16089 + }, + { + "epoch": 4.3510005408328825, + "grad_norm": 4.25, + "learning_rate": 0.0027843010856031286, + "loss": 2.8923, + "mean_token_accuracy": 0.44626396894454956, + "num_tokens": 8225091169.0, + "step": 16090 + }, + { + "epoch": 4.3512709572742025, + "grad_norm": 63.25, + "learning_rate": 0.0027836575580081134, + "loss": 4.0676, + "mean_token_accuracy": 0.32736021280288696, + "num_tokens": 8225615410.0, + "step": 16091 + }, + { + "epoch": 4.351541373715522, + "grad_norm": 7.53125, + "learning_rate": 0.002783014282514646, + "loss": 2.7502, + "mean_token_accuracy": 0.4251788258552551, + "num_tokens": 8226104244.0, + "step": 16092 + }, + { + "epoch": 4.351811790156842, + "grad_norm": 2.59375, + "learning_rate": 0.0027823712591424662, + "loss": 2.901, + "mean_token_accuracy": 0.43465709686279297, + "num_tokens": 8226628396.0, + "step": 16093 + }, + { + "epoch": 4.352082206598161, + "grad_norm": 4.59375, + "learning_rate": 0.0027817284879113023, + "loss": 2.8568, + "mean_token_accuracy": 0.4545075595378876, + "num_tokens": 8227152653.0, + "step": 16094 + }, + { + "epoch": 4.352352623039481, + "grad_norm": 3.40625, + "learning_rate": 0.0027810859688408756, + "loss": 2.6285, + "mean_token_accuracy": 0.4456718862056732, + "num_tokens": 8227676783.0, + "step": 16095 + }, + { + "epoch": 4.3526230394808, + "grad_norm": 4.28125, + "learning_rate": 0.0027804437019509033, + "loss": 2.8556, + "mean_token_accuracy": 0.4356306493282318, + "num_tokens": 8228197392.0, + "step": 16096 + }, + { + "epoch": 4.35289345592212, + "grad_norm": 3.265625, + "learning_rate": 0.0027798016872610876, + "loss": 2.7807, + "mean_token_accuracy": 0.44932013750076294, + "num_tokens": 8228671284.0, + "step": 16097 + }, + { + "epoch": 4.35316387236344, + "grad_norm": 4.1875, + "learning_rate": 0.0027791599247911337, + "loss": 2.8622, + "mean_token_accuracy": 0.4358082115650177, + "num_tokens": 8229195438.0, + "step": 16098 + }, + { + "epoch": 4.35343428880476, + "grad_norm": 3.9375, + "learning_rate": 0.002778518414560726, + "loss": 2.9572, + "mean_token_accuracy": 0.428280234336853, + "num_tokens": 8229664149.0, + "step": 16099 + }, + { + "epoch": 4.353704705246079, + "grad_norm": 3.78125, + "learning_rate": 0.002777877156589551, + "loss": 2.8016, + "mean_token_accuracy": 0.4423099756240845, + "num_tokens": 8230188248.0, + "step": 16100 + }, + { + "epoch": 4.353975121687399, + "grad_norm": 3.859375, + "learning_rate": 0.0027772361508972875, + "loss": 2.733, + "mean_token_accuracy": 0.45081329345703125, + "num_tokens": 8230637507.0, + "step": 16101 + }, + { + "epoch": 4.354245538128718, + "grad_norm": 3.59375, + "learning_rate": 0.0027765953975036, + "loss": 2.9021, + "mean_token_accuracy": 0.44665658473968506, + "num_tokens": 8231110461.0, + "step": 16102 + }, + { + "epoch": 4.354515954570038, + "grad_norm": 3.40625, + "learning_rate": 0.0027759548964281476, + "loss": 2.722, + "mean_token_accuracy": 0.45558667182922363, + "num_tokens": 8231617101.0, + "step": 16103 + }, + { + "epoch": 4.3547863710113575, + "grad_norm": 3.46875, + "learning_rate": 0.002775314647690585, + "loss": 2.6432, + "mean_token_accuracy": 0.4556301236152649, + "num_tokens": 8232141294.0, + "step": 16104 + }, + { + "epoch": 4.355056787452677, + "grad_norm": 3.4375, + "learning_rate": 0.002774674651310557, + "loss": 2.9185, + "mean_token_accuracy": 0.42263996601104736, + "num_tokens": 8232665481.0, + "step": 16105 + }, + { + "epoch": 4.355327203893997, + "grad_norm": 3.796875, + "learning_rate": 0.002774034907307698, + "loss": 2.8448, + "mean_token_accuracy": 0.44206908345222473, + "num_tokens": 8233135441.0, + "step": 16106 + }, + { + "epoch": 4.355597620335317, + "grad_norm": 4.03125, + "learning_rate": 0.002773395415701641, + "loss": 2.9088, + "mean_token_accuracy": 0.40865230560302734, + "num_tokens": 8233659572.0, + "step": 16107 + }, + { + "epoch": 4.355868036776636, + "grad_norm": 3.9375, + "learning_rate": 0.002772756176512006, + "loss": 2.7247, + "mean_token_accuracy": 0.4292203187942505, + "num_tokens": 8234183718.0, + "step": 16108 + }, + { + "epoch": 4.356138453217955, + "grad_norm": 3.421875, + "learning_rate": 0.002772117189758403, + "loss": 2.7957, + "mean_token_accuracy": 0.4271707236766815, + "num_tokens": 8234707894.0, + "step": 16109 + }, + { + "epoch": 4.356408869659275, + "grad_norm": 4.28125, + "learning_rate": 0.0027714784554604445, + "loss": 2.6445, + "mean_token_accuracy": 0.4406440854072571, + "num_tokens": 8235232014.0, + "step": 16110 + }, + { + "epoch": 4.3566792861005945, + "grad_norm": 58.75, + "learning_rate": 0.002770839973637725, + "loss": 4.1591, + "mean_token_accuracy": 0.38839614391326904, + "num_tokens": 8235745847.0, + "step": 16111 + }, + { + "epoch": 4.356949702541915, + "grad_norm": 9.1875, + "learning_rate": 0.0027702017443098337, + "loss": 2.8406, + "mean_token_accuracy": 0.41123512387275696, + "num_tokens": 8236269979.0, + "step": 16112 + }, + { + "epoch": 4.357220118983234, + "grad_norm": 2.6875, + "learning_rate": 0.0027695637674963547, + "loss": 2.8548, + "mean_token_accuracy": 0.41709867119789124, + "num_tokens": 8236794123.0, + "step": 16113 + }, + { + "epoch": 4.357490535424554, + "grad_norm": 4.0, + "learning_rate": 0.0027689260432168655, + "loss": 2.9617, + "mean_token_accuracy": 0.42011556029319763, + "num_tokens": 8237318382.0, + "step": 16114 + }, + { + "epoch": 4.357760951865873, + "grad_norm": 3.515625, + "learning_rate": 0.0027682885714909295, + "loss": 2.7015, + "mean_token_accuracy": 0.43328723311424255, + "num_tokens": 8237842511.0, + "step": 16115 + }, + { + "epoch": 4.358031368307193, + "grad_norm": 3.140625, + "learning_rate": 0.0027676513523381095, + "loss": 2.834, + "mean_token_accuracy": 0.44248682260513306, + "num_tokens": 8238309225.0, + "step": 16116 + }, + { + "epoch": 4.358301784748512, + "grad_norm": 3.359375, + "learning_rate": 0.002767014385777956, + "loss": 2.8086, + "mean_token_accuracy": 0.437348872423172, + "num_tokens": 8238833457.0, + "step": 16117 + }, + { + "epoch": 4.3585722011898325, + "grad_norm": 3.96875, + "learning_rate": 0.002766377671830011, + "loss": 2.6982, + "mean_token_accuracy": 0.44483816623687744, + "num_tokens": 8239357682.0, + "step": 16118 + }, + { + "epoch": 4.358842617631152, + "grad_norm": 3.015625, + "learning_rate": 0.0027657412105138147, + "loss": 2.893, + "mean_token_accuracy": 0.44511955976486206, + "num_tokens": 8239881961.0, + "step": 16119 + }, + { + "epoch": 4.359113034072472, + "grad_norm": 3.859375, + "learning_rate": 0.0027651050018488938, + "loss": 2.8239, + "mean_token_accuracy": 0.4486904740333557, + "num_tokens": 8240406241.0, + "step": 16120 + }, + { + "epoch": 4.359383450513791, + "grad_norm": 3.53125, + "learning_rate": 0.0027644690458547656, + "loss": 2.8771, + "mean_token_accuracy": 0.43182116746902466, + "num_tokens": 8240930520.0, + "step": 16121 + }, + { + "epoch": 4.359653866955111, + "grad_norm": 3.421875, + "learning_rate": 0.0027638333425509484, + "loss": 2.8335, + "mean_token_accuracy": 0.4243272840976715, + "num_tokens": 8241454781.0, + "step": 16122 + }, + { + "epoch": 4.35992428339643, + "grad_norm": 3.671875, + "learning_rate": 0.002763197891956944, + "loss": 2.9305, + "mean_token_accuracy": 0.4341946840286255, + "num_tokens": 8241979022.0, + "step": 16123 + }, + { + "epoch": 4.36019469983775, + "grad_norm": 4.25, + "learning_rate": 0.002762562694092252, + "loss": 2.8863, + "mean_token_accuracy": 0.4271395206451416, + "num_tokens": 8242503234.0, + "step": 16124 + }, + { + "epoch": 4.3604651162790695, + "grad_norm": 4.03125, + "learning_rate": 0.0027619277489763622, + "loss": 2.9331, + "mean_token_accuracy": 0.42853572964668274, + "num_tokens": 8243027520.0, + "step": 16125 + }, + { + "epoch": 4.36073553272039, + "grad_norm": 3.375, + "learning_rate": 0.0027612930566287535, + "loss": 2.7197, + "mean_token_accuracy": 0.42644745111465454, + "num_tokens": 8243551665.0, + "step": 16126 + }, + { + "epoch": 4.361005949161709, + "grad_norm": 3.375, + "learning_rate": 0.002760658617068903, + "loss": 2.5741, + "mean_token_accuracy": 0.4539145827293396, + "num_tokens": 8244075672.0, + "step": 16127 + }, + { + "epoch": 4.361276365603029, + "grad_norm": 3.84375, + "learning_rate": 0.0027600244303162768, + "loss": 2.7859, + "mean_token_accuracy": 0.440030038356781, + "num_tokens": 8244599935.0, + "step": 16128 + }, + { + "epoch": 4.361546782044348, + "grad_norm": 4.3125, + "learning_rate": 0.0027593904963903325, + "loss": 2.8826, + "mean_token_accuracy": 0.44459623098373413, + "num_tokens": 8245112857.0, + "step": 16129 + }, + { + "epoch": 4.361817198485668, + "grad_norm": 4.15625, + "learning_rate": 0.0027587568153105238, + "loss": 2.8684, + "mean_token_accuracy": 0.44002565741539, + "num_tokens": 8245637041.0, + "step": 16130 + }, + { + "epoch": 4.362087614926987, + "grad_norm": 21.5, + "learning_rate": 0.002758123387096291, + "loss": 3.448, + "mean_token_accuracy": 0.406586229801178, + "num_tokens": 8246161312.0, + "step": 16131 + }, + { + "epoch": 4.3623580313683075, + "grad_norm": 7.90625, + "learning_rate": 0.0027574902117670696, + "loss": 2.6982, + "mean_token_accuracy": 0.4210583567619324, + "num_tokens": 8246685564.0, + "step": 16132 + }, + { + "epoch": 4.362628447809627, + "grad_norm": 2.875, + "learning_rate": 0.0027568572893422894, + "loss": 2.7938, + "mean_token_accuracy": 0.44234418869018555, + "num_tokens": 8247209574.0, + "step": 16133 + }, + { + "epoch": 4.362898864250947, + "grad_norm": 3.4375, + "learning_rate": 0.0027562246198413686, + "loss": 2.8084, + "mean_token_accuracy": 0.4401415288448334, + "num_tokens": 8247679316.0, + "step": 16134 + }, + { + "epoch": 4.363169280692266, + "grad_norm": 3.1875, + "learning_rate": 0.00275559220328372, + "loss": 2.8153, + "mean_token_accuracy": 0.44462692737579346, + "num_tokens": 8248161243.0, + "step": 16135 + }, + { + "epoch": 4.363439697133586, + "grad_norm": 4.3125, + "learning_rate": 0.0027549600396887474, + "loss": 2.7205, + "mean_token_accuracy": 0.43328070640563965, + "num_tokens": 8248685514.0, + "step": 16136 + }, + { + "epoch": 4.363710113574905, + "grad_norm": 3.265625, + "learning_rate": 0.002754328129075848, + "loss": 2.8021, + "mean_token_accuracy": 0.42411595582962036, + "num_tokens": 8249209715.0, + "step": 16137 + }, + { + "epoch": 4.363980530016225, + "grad_norm": 3.671875, + "learning_rate": 0.0027536964714644104, + "loss": 2.7747, + "mean_token_accuracy": 0.4473828673362732, + "num_tokens": 8249733916.0, + "step": 16138 + }, + { + "epoch": 4.3642509464575445, + "grad_norm": 4.21875, + "learning_rate": 0.0027530650668738162, + "loss": 2.7419, + "mean_token_accuracy": 0.42562437057495117, + "num_tokens": 8250258170.0, + "step": 16139 + }, + { + "epoch": 4.364521362898865, + "grad_norm": 3.15625, + "learning_rate": 0.0027524339153234367, + "loss": 2.7958, + "mean_token_accuracy": 0.4380583167076111, + "num_tokens": 8250782373.0, + "step": 16140 + }, + { + "epoch": 4.364791779340184, + "grad_norm": 4.0625, + "learning_rate": 0.0027518030168326418, + "loss": 2.6686, + "mean_token_accuracy": 0.4452664256095886, + "num_tokens": 8251306651.0, + "step": 16141 + }, + { + "epoch": 4.365062195781504, + "grad_norm": 3.53125, + "learning_rate": 0.002751172371420785, + "loss": 2.7686, + "mean_token_accuracy": 0.43479809165000916, + "num_tokens": 8251830891.0, + "step": 16142 + }, + { + "epoch": 4.365332612222823, + "grad_norm": 3.8125, + "learning_rate": 0.0027505419791072194, + "loss": 2.8992, + "mean_token_accuracy": 0.43259790539741516, + "num_tokens": 8252355049.0, + "step": 16143 + }, + { + "epoch": 4.365603028664143, + "grad_norm": 3.84375, + "learning_rate": 0.0027499118399112845, + "loss": 2.7529, + "mean_token_accuracy": 0.4310670793056488, + "num_tokens": 8252879217.0, + "step": 16144 + }, + { + "epoch": 4.365873445105462, + "grad_norm": 3.5625, + "learning_rate": 0.0027492819538523167, + "loss": 2.8457, + "mean_token_accuracy": 0.4302626848220825, + "num_tokens": 8253403397.0, + "step": 16145 + }, + { + "epoch": 4.366143861546782, + "grad_norm": 3.796875, + "learning_rate": 0.002748652320949642, + "loss": 2.7834, + "mean_token_accuracy": 0.4356287121772766, + "num_tokens": 8253927672.0, + "step": 16146 + }, + { + "epoch": 4.366414277988102, + "grad_norm": 3.953125, + "learning_rate": 0.002748022941222581, + "loss": 2.9413, + "mean_token_accuracy": 0.43165722489356995, + "num_tokens": 8254451856.0, + "step": 16147 + }, + { + "epoch": 4.366684694429422, + "grad_norm": 4.90625, + "learning_rate": 0.0027473938146904443, + "loss": 2.9284, + "mean_token_accuracy": 0.44025614857673645, + "num_tokens": 8254931555.0, + "step": 16148 + }, + { + "epoch": 4.366955110870741, + "grad_norm": 3.65625, + "learning_rate": 0.0027467649413725316, + "loss": 2.7418, + "mean_token_accuracy": 0.4401967525482178, + "num_tokens": 8255404092.0, + "step": 16149 + }, + { + "epoch": 4.36722552731206, + "grad_norm": 3.328125, + "learning_rate": 0.0027461363212881442, + "loss": 2.6401, + "mean_token_accuracy": 0.4980805218219757, + "num_tokens": 8255864128.0, + "step": 16150 + }, + { + "epoch": 4.36749594375338, + "grad_norm": 71.5, + "learning_rate": 0.0027455079544565675, + "loss": 3.735, + "mean_token_accuracy": 0.3721715211868286, + "num_tokens": 8256388347.0, + "step": 16151 + }, + { + "epoch": 4.367766360194699, + "grad_norm": 8.5625, + "learning_rate": 0.00274487984089708, + "loss": 2.6633, + "mean_token_accuracy": 0.4549105167388916, + "num_tokens": 8256863218.0, + "step": 16152 + }, + { + "epoch": 4.3680367766360195, + "grad_norm": 2.515625, + "learning_rate": 0.0027442519806289566, + "loss": 2.8291, + "mean_token_accuracy": 0.4506685435771942, + "num_tokens": 8257387503.0, + "step": 16153 + }, + { + "epoch": 4.368307193077339, + "grad_norm": 3.359375, + "learning_rate": 0.0027436243736714615, + "loss": 2.6544, + "mean_token_accuracy": 0.447659969329834, + "num_tokens": 8257858270.0, + "step": 16154 + }, + { + "epoch": 4.368577609518659, + "grad_norm": 3.921875, + "learning_rate": 0.00274299702004385, + "loss": 2.8702, + "mean_token_accuracy": 0.4326283633708954, + "num_tokens": 8258382475.0, + "step": 16155 + }, + { + "epoch": 4.368848025959978, + "grad_norm": 4.1875, + "learning_rate": 0.0027423699197653716, + "loss": 2.7628, + "mean_token_accuracy": 0.443278968334198, + "num_tokens": 8258906721.0, + "step": 16156 + }, + { + "epoch": 4.369118442401298, + "grad_norm": 4.03125, + "learning_rate": 0.0027417430728552705, + "loss": 2.7216, + "mean_token_accuracy": 0.4457694888114929, + "num_tokens": 8259430853.0, + "step": 16157 + }, + { + "epoch": 4.369388858842617, + "grad_norm": 3.84375, + "learning_rate": 0.0027411164793327758, + "loss": 2.733, + "mean_token_accuracy": 0.45152151584625244, + "num_tokens": 8259897971.0, + "step": 16158 + }, + { + "epoch": 4.369659275283937, + "grad_norm": 4.5, + "learning_rate": 0.0027404901392171174, + "loss": 2.703, + "mean_token_accuracy": 0.42437559366226196, + "num_tokens": 8260422056.0, + "step": 16159 + }, + { + "epoch": 4.369929691725257, + "grad_norm": 3.203125, + "learning_rate": 0.00273986405252751, + "loss": 2.8199, + "mean_token_accuracy": 0.45800894498825073, + "num_tokens": 8260865619.0, + "step": 16160 + }, + { + "epoch": 4.370200108166577, + "grad_norm": 3.59375, + "learning_rate": 0.002739238219283164, + "loss": 2.5754, + "mean_token_accuracy": 0.44521093368530273, + "num_tokens": 8261389880.0, + "step": 16161 + }, + { + "epoch": 4.370470524607896, + "grad_norm": 3.65625, + "learning_rate": 0.002738612639503284, + "loss": 2.7586, + "mean_token_accuracy": 0.4396205544471741, + "num_tokens": 8261914096.0, + "step": 16162 + }, + { + "epoch": 4.370740941049216, + "grad_norm": 3.875, + "learning_rate": 0.002737987313207064, + "loss": 2.6422, + "mean_token_accuracy": 0.4307039976119995, + "num_tokens": 8262438243.0, + "step": 16163 + }, + { + "epoch": 4.371011357490535, + "grad_norm": 3.234375, + "learning_rate": 0.0027373622404136882, + "loss": 2.7773, + "mean_token_accuracy": 0.45612001419067383, + "num_tokens": 8262940839.0, + "step": 16164 + }, + { + "epoch": 4.371281773931855, + "grad_norm": 3.578125, + "learning_rate": 0.002736737421142339, + "loss": 2.7243, + "mean_token_accuracy": 0.4365212917327881, + "num_tokens": 8263421154.0, + "step": 16165 + }, + { + "epoch": 4.371552190373174, + "grad_norm": 3.28125, + "learning_rate": 0.0027361128554121874, + "loss": 2.8114, + "mean_token_accuracy": 0.4408436417579651, + "num_tokens": 8263945377.0, + "step": 16166 + }, + { + "epoch": 4.3718226068144945, + "grad_norm": 3.25, + "learning_rate": 0.002735488543242393, + "loss": 2.6263, + "mean_token_accuracy": 0.4520202875137329, + "num_tokens": 8264469662.0, + "step": 16167 + }, + { + "epoch": 4.372093023255814, + "grad_norm": 3.015625, + "learning_rate": 0.002734864484652117, + "loss": 2.8779, + "mean_token_accuracy": 0.43388503789901733, + "num_tokens": 8264993879.0, + "step": 16168 + }, + { + "epoch": 4.372363439697134, + "grad_norm": 4.25, + "learning_rate": 0.002734240679660503, + "loss": 2.8188, + "mean_token_accuracy": 0.45275571942329407, + "num_tokens": 8265518068.0, + "step": 16169 + }, + { + "epoch": 4.372633856138453, + "grad_norm": 4.0, + "learning_rate": 0.0027336171282866927, + "loss": 2.8537, + "mean_token_accuracy": 0.42925703525543213, + "num_tokens": 8266042190.0, + "step": 16170 + }, + { + "epoch": 4.372904272579773, + "grad_norm": 48.25, + "learning_rate": 0.0027329938305498203, + "loss": 4.7605, + "mean_token_accuracy": 0.24821837246418, + "num_tokens": 8266566467.0, + "step": 16171 + }, + { + "epoch": 4.373174689021092, + "grad_norm": 7.125, + "learning_rate": 0.002732370786469008, + "loss": 2.8343, + "mean_token_accuracy": 0.4371631145477295, + "num_tokens": 8267090743.0, + "step": 16172 + }, + { + "epoch": 4.373445105462412, + "grad_norm": 2.53125, + "learning_rate": 0.002731747996063374, + "loss": 2.894, + "mean_token_accuracy": 0.4334844946861267, + "num_tokens": 8267614881.0, + "step": 16173 + }, + { + "epoch": 4.373715521903732, + "grad_norm": 3.484375, + "learning_rate": 0.0027311254593520274, + "loss": 2.8892, + "mean_token_accuracy": 0.41855260729789734, + "num_tokens": 8268139051.0, + "step": 16174 + }, + { + "epoch": 4.373985938345052, + "grad_norm": 2.984375, + "learning_rate": 0.0027305031763540676, + "loss": 2.7955, + "mean_token_accuracy": 0.4551364779472351, + "num_tokens": 8268616165.0, + "step": 16175 + }, + { + "epoch": 4.374256354786371, + "grad_norm": 4.5625, + "learning_rate": 0.002729881147088591, + "loss": 2.8022, + "mean_token_accuracy": 0.4462168216705322, + "num_tokens": 8269140367.0, + "step": 16176 + }, + { + "epoch": 4.374526771227691, + "grad_norm": 4.875, + "learning_rate": 0.0027292593715746807, + "loss": 2.9412, + "mean_token_accuracy": 0.4243949055671692, + "num_tokens": 8269664452.0, + "step": 16177 + }, + { + "epoch": 4.37479718766901, + "grad_norm": 4.21875, + "learning_rate": 0.0027286378498314148, + "loss": 2.7959, + "mean_token_accuracy": 0.4394664764404297, + "num_tokens": 8270150866.0, + "step": 16178 + }, + { + "epoch": 4.37506760411033, + "grad_norm": 3.5, + "learning_rate": 0.002728016581877865, + "loss": 2.9735, + "mean_token_accuracy": 0.4273431897163391, + "num_tokens": 8270675119.0, + "step": 16179 + }, + { + "epoch": 4.375338020551649, + "grad_norm": 4.09375, + "learning_rate": 0.002727395567733094, + "loss": 2.9001, + "mean_token_accuracy": 0.4322075843811035, + "num_tokens": 8271142138.0, + "step": 16180 + }, + { + "epoch": 4.3756084369929695, + "grad_norm": 3.640625, + "learning_rate": 0.0027267748074161526, + "loss": 2.7111, + "mean_token_accuracy": 0.43832263350486755, + "num_tokens": 8271666414.0, + "step": 16181 + }, + { + "epoch": 4.375878853434289, + "grad_norm": 3.578125, + "learning_rate": 0.0027261543009460916, + "loss": 2.8307, + "mean_token_accuracy": 0.4342527389526367, + "num_tokens": 8272190581.0, + "step": 16182 + }, + { + "epoch": 4.376149269875609, + "grad_norm": 4.375, + "learning_rate": 0.0027255340483419467, + "loss": 2.8682, + "mean_token_accuracy": 0.4324861764907837, + "num_tokens": 8272714817.0, + "step": 16183 + }, + { + "epoch": 4.376419686316928, + "grad_norm": 4.125, + "learning_rate": 0.002724914049622753, + "loss": 2.8437, + "mean_token_accuracy": 0.435670405626297, + "num_tokens": 8273239062.0, + "step": 16184 + }, + { + "epoch": 4.376690102758248, + "grad_norm": 4.3125, + "learning_rate": 0.0027242943048075303, + "loss": 2.6623, + "mean_token_accuracy": 0.46414875984191895, + "num_tokens": 8273763273.0, + "step": 16185 + }, + { + "epoch": 4.376960519199567, + "grad_norm": 3.8125, + "learning_rate": 0.002723674813915296, + "loss": 2.8524, + "mean_token_accuracy": 0.4387196898460388, + "num_tokens": 8274287535.0, + "step": 16186 + }, + { + "epoch": 4.3772309356408865, + "grad_norm": 4.40625, + "learning_rate": 0.0027230555769650576, + "loss": 2.7539, + "mean_token_accuracy": 0.46043938398361206, + "num_tokens": 8274785292.0, + "step": 16187 + }, + { + "epoch": 4.377501352082207, + "grad_norm": 4.1875, + "learning_rate": 0.002722436593975815, + "loss": 2.6382, + "mean_token_accuracy": 0.4436706304550171, + "num_tokens": 8275309535.0, + "step": 16188 + }, + { + "epoch": 4.377771768523527, + "grad_norm": 5.0, + "learning_rate": 0.0027218178649665616, + "loss": 2.552, + "mean_token_accuracy": 0.47282737493515015, + "num_tokens": 8275811693.0, + "step": 16189 + }, + { + "epoch": 4.378042184964846, + "grad_norm": 3.3125, + "learning_rate": 0.002721199389956279, + "loss": 2.7166, + "mean_token_accuracy": 0.43451035022735596, + "num_tokens": 8276314764.0, + "step": 16190 + }, + { + "epoch": 4.378312601406165, + "grad_norm": 115.0, + "learning_rate": 0.0027205811689639463, + "loss": 5.3948, + "mean_token_accuracy": 0.2402326464653015, + "num_tokens": 8276838975.0, + "step": 16191 + }, + { + "epoch": 4.378583017847485, + "grad_norm": 7.125, + "learning_rate": 0.0027199632020085307, + "loss": 2.8218, + "mean_token_accuracy": 0.4380999207496643, + "num_tokens": 8277363181.0, + "step": 16192 + }, + { + "epoch": 4.378853434288804, + "grad_norm": 4.375, + "learning_rate": 0.002719345489108995, + "loss": 2.7079, + "mean_token_accuracy": 0.42333489656448364, + "num_tokens": 8277887365.0, + "step": 16193 + }, + { + "epoch": 4.3791238507301244, + "grad_norm": 3.65625, + "learning_rate": 0.002718728030284292, + "loss": 2.7737, + "mean_token_accuracy": 0.4363635182380676, + "num_tokens": 8278411546.0, + "step": 16194 + }, + { + "epoch": 4.379394267171444, + "grad_norm": 3.0625, + "learning_rate": 0.002718110825553364, + "loss": 2.9173, + "mean_token_accuracy": 0.4310222268104553, + "num_tokens": 8278892957.0, + "step": 16195 + }, + { + "epoch": 4.379664683612764, + "grad_norm": 4.0625, + "learning_rate": 0.002717493874935154, + "loss": 2.7468, + "mean_token_accuracy": 0.4478920102119446, + "num_tokens": 8279417166.0, + "step": 16196 + }, + { + "epoch": 4.379935100054083, + "grad_norm": 3.640625, + "learning_rate": 0.002716877178448588, + "loss": 2.604, + "mean_token_accuracy": 0.4532041549682617, + "num_tokens": 8279941389.0, + "step": 16197 + }, + { + "epoch": 4.380205516495403, + "grad_norm": 3.78125, + "learning_rate": 0.00271626073611259, + "loss": 2.8738, + "mean_token_accuracy": 0.4343199133872986, + "num_tokens": 8280465563.0, + "step": 16198 + }, + { + "epoch": 4.380475932936722, + "grad_norm": 3.734375, + "learning_rate": 0.0027156445479460723, + "loss": 2.8715, + "mean_token_accuracy": 0.41959065198898315, + "num_tokens": 8280989716.0, + "step": 16199 + }, + { + "epoch": 4.380746349378042, + "grad_norm": 3.609375, + "learning_rate": 0.002715028613967944, + "loss": 2.9944, + "mean_token_accuracy": 0.4150256812572479, + "num_tokens": 8281513849.0, + "step": 16200 + }, + { + "epoch": 4.3810167658193615, + "grad_norm": 3.765625, + "learning_rate": 0.0027144129341971, + "loss": 2.9042, + "mean_token_accuracy": 0.4328746497631073, + "num_tokens": 8281979198.0, + "step": 16201 + }, + { + "epoch": 4.381287182260682, + "grad_norm": 3.40625, + "learning_rate": 0.0027137975086524357, + "loss": 2.7763, + "mean_token_accuracy": 0.43006432056427, + "num_tokens": 8282503358.0, + "step": 16202 + }, + { + "epoch": 4.381557598702001, + "grad_norm": 3.21875, + "learning_rate": 0.0027131823373528325, + "loss": 2.8344, + "mean_token_accuracy": 0.4405997097492218, + "num_tokens": 8283027548.0, + "step": 16203 + }, + { + "epoch": 4.381828015143321, + "grad_norm": 3.875, + "learning_rate": 0.002712567420317163, + "loss": 2.91, + "mean_token_accuracy": 0.4123982787132263, + "num_tokens": 8283551740.0, + "step": 16204 + }, + { + "epoch": 4.38209843158464, + "grad_norm": 3.625, + "learning_rate": 0.0027119527575642987, + "loss": 2.8407, + "mean_token_accuracy": 0.44174903631210327, + "num_tokens": 8284076018.0, + "step": 16205 + }, + { + "epoch": 4.38236884802596, + "grad_norm": 3.96875, + "learning_rate": 0.002711338349113097, + "loss": 2.7627, + "mean_token_accuracy": 0.44952547550201416, + "num_tokens": 8284600298.0, + "step": 16206 + }, + { + "epoch": 4.382639264467279, + "grad_norm": 3.921875, + "learning_rate": 0.0027107241949824086, + "loss": 2.8674, + "mean_token_accuracy": 0.4322633743286133, + "num_tokens": 8285124579.0, + "step": 16207 + }, + { + "epoch": 4.3829096809085994, + "grad_norm": 4.90625, + "learning_rate": 0.0027101102951910807, + "loss": 2.9816, + "mean_token_accuracy": 0.4466671347618103, + "num_tokens": 8285590859.0, + "step": 16208 + }, + { + "epoch": 4.383180097349919, + "grad_norm": 4.40625, + "learning_rate": 0.002709496649757948, + "loss": 2.8265, + "mean_token_accuracy": 0.4548864960670471, + "num_tokens": 8286115113.0, + "step": 16209 + }, + { + "epoch": 4.383450513791239, + "grad_norm": 3.953125, + "learning_rate": 0.0027088832587018373, + "loss": 2.5971, + "mean_token_accuracy": 0.44610458612442017, + "num_tokens": 8286639355.0, + "step": 16210 + }, + { + "epoch": 4.383720930232558, + "grad_norm": 80.5, + "learning_rate": 0.002708270122041573, + "loss": 7.8781, + "mean_token_accuracy": 0.09763413667678833, + "num_tokens": 8287163622.0, + "step": 16211 + }, + { + "epoch": 4.383991346673878, + "grad_norm": 6.78125, + "learning_rate": 0.0027076572397959626, + "loss": 2.9466, + "mean_token_accuracy": 0.4235902726650238, + "num_tokens": 8287687887.0, + "step": 16212 + }, + { + "epoch": 4.384261763115197, + "grad_norm": 2.78125, + "learning_rate": 0.0027070446119838157, + "loss": 2.7566, + "mean_token_accuracy": 0.4383903741836548, + "num_tokens": 8288212082.0, + "step": 16213 + }, + { + "epoch": 4.384532179556517, + "grad_norm": 3.21875, + "learning_rate": 0.002706432238623929, + "loss": 2.8716, + "mean_token_accuracy": 0.436711847782135, + "num_tokens": 8288685568.0, + "step": 16214 + }, + { + "epoch": 4.3848025959978365, + "grad_norm": 3.828125, + "learning_rate": 0.0027058201197350897, + "loss": 2.7082, + "mean_token_accuracy": 0.4360613226890564, + "num_tokens": 8289209792.0, + "step": 16215 + }, + { + "epoch": 4.385073012439157, + "grad_norm": 3.390625, + "learning_rate": 0.0027052082553360814, + "loss": 2.7844, + "mean_token_accuracy": 0.43437886238098145, + "num_tokens": 8289734063.0, + "step": 16216 + }, + { + "epoch": 4.385343428880476, + "grad_norm": 3.671875, + "learning_rate": 0.0027045966454456783, + "loss": 2.8147, + "mean_token_accuracy": 0.435899019241333, + "num_tokens": 8290241688.0, + "step": 16217 + }, + { + "epoch": 4.385613845321796, + "grad_norm": 3.453125, + "learning_rate": 0.002703985290082644, + "loss": 2.7987, + "mean_token_accuracy": 0.4219896197319031, + "num_tokens": 8290765861.0, + "step": 16218 + }, + { + "epoch": 4.385884261763115, + "grad_norm": 3.75, + "learning_rate": 0.0027033741892657395, + "loss": 2.6232, + "mean_token_accuracy": 0.4498560428619385, + "num_tokens": 8291289996.0, + "step": 16219 + }, + { + "epoch": 4.386154678204435, + "grad_norm": 3.671875, + "learning_rate": 0.002702763343013713, + "loss": 2.6951, + "mean_token_accuracy": 0.4416341781616211, + "num_tokens": 8291814031.0, + "step": 16220 + }, + { + "epoch": 4.386425094645754, + "grad_norm": 3.59375, + "learning_rate": 0.002702152751345305, + "loss": 2.8374, + "mean_token_accuracy": 0.43044281005859375, + "num_tokens": 8292338180.0, + "step": 16221 + }, + { + "epoch": 4.3866955110870745, + "grad_norm": 3.578125, + "learning_rate": 0.0027015424142792567, + "loss": 2.8944, + "mean_token_accuracy": 0.42977282404899597, + "num_tokens": 8292862411.0, + "step": 16222 + }, + { + "epoch": 4.386965927528394, + "grad_norm": 4.25, + "learning_rate": 0.0027009323318342906, + "loss": 2.7706, + "mean_token_accuracy": 0.46114271879196167, + "num_tokens": 8293328757.0, + "step": 16223 + }, + { + "epoch": 4.387236343969714, + "grad_norm": 3.359375, + "learning_rate": 0.0027003225040291247, + "loss": 2.7544, + "mean_token_accuracy": 0.4320651888847351, + "num_tokens": 8293806157.0, + "step": 16224 + }, + { + "epoch": 4.387506760411033, + "grad_norm": 3.78125, + "learning_rate": 0.0026997129308824735, + "loss": 2.773, + "mean_token_accuracy": 0.45581626892089844, + "num_tokens": 8294306810.0, + "step": 16225 + }, + { + "epoch": 4.387777176852353, + "grad_norm": 3.5625, + "learning_rate": 0.0026991036124130374, + "loss": 2.7446, + "mean_token_accuracy": 0.446826696395874, + "num_tokens": 8294831086.0, + "step": 16226 + }, + { + "epoch": 4.388047593293672, + "grad_norm": 4.65625, + "learning_rate": 0.0026984945486395167, + "loss": 2.731, + "mean_token_accuracy": 0.4119003415107727, + "num_tokens": 8295355274.0, + "step": 16227 + }, + { + "epoch": 4.388318009734991, + "grad_norm": 3.765625, + "learning_rate": 0.0026978857395805935, + "loss": 2.5672, + "mean_token_accuracy": 0.43544912338256836, + "num_tokens": 8295879471.0, + "step": 16228 + }, + { + "epoch": 4.3885884261763115, + "grad_norm": 3.546875, + "learning_rate": 0.0026972771852549525, + "loss": 2.8319, + "mean_token_accuracy": 0.44165825843811035, + "num_tokens": 8296403726.0, + "step": 16229 + }, + { + "epoch": 4.388858842617632, + "grad_norm": 4.3125, + "learning_rate": 0.0026966688856812637, + "loss": 2.8137, + "mean_token_accuracy": 0.4286981523036957, + "num_tokens": 8296918300.0, + "step": 16230 + }, + { + "epoch": 4.389129259058951, + "grad_norm": 48.5, + "learning_rate": 0.0026960608408781927, + "loss": 4.5961, + "mean_token_accuracy": 0.28348904848098755, + "num_tokens": 8297401492.0, + "step": 16231 + }, + { + "epoch": 4.38939967550027, + "grad_norm": 5.59375, + "learning_rate": 0.0026954530508643947, + "loss": 3.0507, + "mean_token_accuracy": 0.41938820481300354, + "num_tokens": 8297925759.0, + "step": 16232 + }, + { + "epoch": 4.38967009194159, + "grad_norm": 3.1875, + "learning_rate": 0.002694845515658519, + "loss": 2.7115, + "mean_token_accuracy": 0.4624529480934143, + "num_tokens": 8298450044.0, + "step": 16233 + }, + { + "epoch": 4.389940508382909, + "grad_norm": 3.109375, + "learning_rate": 0.002694238235279207, + "loss": 2.7469, + "mean_token_accuracy": 0.4504620134830475, + "num_tokens": 8298941647.0, + "step": 16234 + }, + { + "epoch": 4.390210924824229, + "grad_norm": 3.21875, + "learning_rate": 0.0026936312097450934, + "loss": 2.9544, + "mean_token_accuracy": 0.43086814880371094, + "num_tokens": 8299465842.0, + "step": 16235 + }, + { + "epoch": 4.390481341265549, + "grad_norm": 3.265625, + "learning_rate": 0.0026930244390747987, + "loss": 2.7865, + "mean_token_accuracy": 0.42291730642318726, + "num_tokens": 8299990115.0, + "step": 16236 + }, + { + "epoch": 4.390751757706869, + "grad_norm": 3.453125, + "learning_rate": 0.0026924179232869455, + "loss": 2.6846, + "mean_token_accuracy": 0.45328930020332336, + "num_tokens": 8300482428.0, + "step": 16237 + }, + { + "epoch": 4.391022174148188, + "grad_norm": 3.578125, + "learning_rate": 0.00269181166240014, + "loss": 3.0228, + "mean_token_accuracy": 0.4213225245475769, + "num_tokens": 8301006642.0, + "step": 16238 + }, + { + "epoch": 4.391292590589508, + "grad_norm": 3.921875, + "learning_rate": 0.002691205656432986, + "loss": 2.8774, + "mean_token_accuracy": 0.43020230531692505, + "num_tokens": 8301530885.0, + "step": 16239 + }, + { + "epoch": 4.391563007030827, + "grad_norm": 3.71875, + "learning_rate": 0.002690599905404076, + "loss": 2.6734, + "mean_token_accuracy": 0.4427937865257263, + "num_tokens": 8302055167.0, + "step": 16240 + }, + { + "epoch": 4.391833423472147, + "grad_norm": 3.875, + "learning_rate": 0.0026899944093319995, + "loss": 2.8605, + "mean_token_accuracy": 0.40965765714645386, + "num_tokens": 8302579418.0, + "step": 16241 + }, + { + "epoch": 4.392103839913466, + "grad_norm": 3.421875, + "learning_rate": 0.00268938916823533, + "loss": 2.6734, + "mean_token_accuracy": 0.44755393266677856, + "num_tokens": 8303103551.0, + "step": 16242 + }, + { + "epoch": 4.3923742563547865, + "grad_norm": 3.71875, + "learning_rate": 0.0026887841821326432, + "loss": 2.6659, + "mean_token_accuracy": 0.43914544582366943, + "num_tokens": 8303627812.0, + "step": 16243 + }, + { + "epoch": 4.392644672796106, + "grad_norm": 3.71875, + "learning_rate": 0.002688179451042498, + "loss": 2.6821, + "mean_token_accuracy": 0.4397912919521332, + "num_tokens": 8304152021.0, + "step": 16244 + }, + { + "epoch": 4.392915089237426, + "grad_norm": 3.546875, + "learning_rate": 0.0026875749749834515, + "loss": 2.7221, + "mean_token_accuracy": 0.45495182275772095, + "num_tokens": 8304676277.0, + "step": 16245 + }, + { + "epoch": 4.393185505678745, + "grad_norm": 4.125, + "learning_rate": 0.0026869707539740506, + "loss": 2.9109, + "mean_token_accuracy": 0.41152286529541016, + "num_tokens": 8305200455.0, + "step": 16246 + }, + { + "epoch": 4.393455922120065, + "grad_norm": 3.59375, + "learning_rate": 0.0026863667880328325, + "loss": 2.7659, + "mean_token_accuracy": 0.45776551961898804, + "num_tokens": 8305684907.0, + "step": 16247 + }, + { + "epoch": 4.393726338561384, + "grad_norm": 4.5625, + "learning_rate": 0.0026857630771783304, + "loss": 2.885, + "mean_token_accuracy": 0.44452524185180664, + "num_tokens": 8306148172.0, + "step": 16248 + }, + { + "epoch": 4.393996755002704, + "grad_norm": 4.25, + "learning_rate": 0.0026851596214290687, + "loss": 2.7751, + "mean_token_accuracy": 0.4412524998188019, + "num_tokens": 8306645609.0, + "step": 16249 + }, + { + "epoch": 4.394267171444024, + "grad_norm": 3.109375, + "learning_rate": 0.00268455642080356, + "loss": 2.8235, + "mean_token_accuracy": 0.4327937364578247, + "num_tokens": 8307169852.0, + "step": 16250 + }, + { + "epoch": 4.394537587885344, + "grad_norm": 41.5, + "learning_rate": 0.002683953475320317, + "loss": 3.7193, + "mean_token_accuracy": 0.3262800872325897, + "num_tokens": 8307624783.0, + "step": 16251 + }, + { + "epoch": 4.394808004326663, + "grad_norm": 8.5625, + "learning_rate": 0.0026833507849978373, + "loss": 2.9427, + "mean_token_accuracy": 0.44197380542755127, + "num_tokens": 8308108537.0, + "step": 16252 + }, + { + "epoch": 4.395078420767983, + "grad_norm": 3.546875, + "learning_rate": 0.0026827483498546114, + "loss": 2.9327, + "mean_token_accuracy": 0.43077829480171204, + "num_tokens": 8308632799.0, + "step": 16253 + }, + { + "epoch": 4.395348837209302, + "grad_norm": 3.53125, + "learning_rate": 0.0026821461699091265, + "loss": 2.7703, + "mean_token_accuracy": 0.45132172107696533, + "num_tokens": 8309156944.0, + "step": 16254 + }, + { + "epoch": 4.395619253650622, + "grad_norm": 4.25, + "learning_rate": 0.002681544245179857, + "loss": 2.725, + "mean_token_accuracy": 0.4442336857318878, + "num_tokens": 8309656338.0, + "step": 16255 + }, + { + "epoch": 4.395889670091941, + "grad_norm": 16.5, + "learning_rate": 0.002680942575685273, + "loss": 2.4908, + "mean_token_accuracy": 0.4819096028804779, + "num_tokens": 8310180470.0, + "step": 16256 + }, + { + "epoch": 4.3961600865332615, + "grad_norm": 4.59375, + "learning_rate": 0.0026803411614438376, + "loss": 2.9324, + "mean_token_accuracy": 0.41512221097946167, + "num_tokens": 8310704736.0, + "step": 16257 + }, + { + "epoch": 4.396430502974581, + "grad_norm": 4.1875, + "learning_rate": 0.0026797400024740022, + "loss": 2.8602, + "mean_token_accuracy": 0.435776948928833, + "num_tokens": 8311225365.0, + "step": 16258 + }, + { + "epoch": 4.396700919415901, + "grad_norm": 3.65625, + "learning_rate": 0.002679139098794209, + "loss": 2.7906, + "mean_token_accuracy": 0.4474433660507202, + "num_tokens": 8311749635.0, + "step": 16259 + }, + { + "epoch": 4.39697133585722, + "grad_norm": 4.625, + "learning_rate": 0.0026785384504229006, + "loss": 2.7186, + "mean_token_accuracy": 0.46275702118873596, + "num_tokens": 8312201421.0, + "step": 16260 + }, + { + "epoch": 4.39724175229854, + "grad_norm": 3.671875, + "learning_rate": 0.002677938057378502, + "loss": 2.7281, + "mean_token_accuracy": 0.4478234648704529, + "num_tokens": 8312725704.0, + "step": 16261 + }, + { + "epoch": 4.397512168739859, + "grad_norm": 4.125, + "learning_rate": 0.00267733791967944, + "loss": 2.9179, + "mean_token_accuracy": 0.42969146370887756, + "num_tokens": 8313249984.0, + "step": 16262 + }, + { + "epoch": 4.397782585181179, + "grad_norm": 3.78125, + "learning_rate": 0.0026767380373441253, + "loss": 2.9322, + "mean_token_accuracy": 0.42853012681007385, + "num_tokens": 8313765237.0, + "step": 16263 + }, + { + "epoch": 4.398053001622499, + "grad_norm": 4.21875, + "learning_rate": 0.0026761384103909636, + "loss": 2.8519, + "mean_token_accuracy": 0.4357980191707611, + "num_tokens": 8314246075.0, + "step": 16264 + }, + { + "epoch": 4.398323418063819, + "grad_norm": 3.125, + "learning_rate": 0.002675539038838356, + "loss": 2.693, + "mean_token_accuracy": 0.44354552030563354, + "num_tokens": 8314770321.0, + "step": 16265 + }, + { + "epoch": 4.398593834505138, + "grad_norm": 4.34375, + "learning_rate": 0.002674939922704691, + "loss": 2.9926, + "mean_token_accuracy": 0.41824424266815186, + "num_tokens": 8315294464.0, + "step": 16266 + }, + { + "epoch": 4.398864250946458, + "grad_norm": 3.484375, + "learning_rate": 0.002674341062008351, + "loss": 2.4354, + "mean_token_accuracy": 0.513087272644043, + "num_tokens": 8315818518.0, + "step": 16267 + }, + { + "epoch": 4.399134667387777, + "grad_norm": 2.828125, + "learning_rate": 0.0026737424567677115, + "loss": 2.8889, + "mean_token_accuracy": 0.4286423325538635, + "num_tokens": 8316342721.0, + "step": 16268 + }, + { + "epoch": 4.399405083829096, + "grad_norm": 3.765625, + "learning_rate": 0.0026731441070011377, + "loss": 2.8136, + "mean_token_accuracy": 0.45083749294281006, + "num_tokens": 8316790045.0, + "step": 16269 + }, + { + "epoch": 4.399675500270416, + "grad_norm": 4.40625, + "learning_rate": 0.002672546012726992, + "loss": 2.9009, + "mean_token_accuracy": 0.43474704027175903, + "num_tokens": 8317314236.0, + "step": 16270 + }, + { + "epoch": 4.3999459167117365, + "grad_norm": 50.0, + "learning_rate": 0.0026719481739636247, + "loss": 3.7076, + "mean_token_accuracy": 0.3941137194633484, + "num_tokens": 8317838512.0, + "step": 16271 + }, + { + "epoch": 4.400216333153056, + "grad_norm": 7.3125, + "learning_rate": 0.0026713505907293785, + "loss": 2.5836, + "mean_token_accuracy": 0.4228050112724304, + "num_tokens": 8318362724.0, + "step": 16272 + }, + { + "epoch": 4.400486749594375, + "grad_norm": 3.484375, + "learning_rate": 0.002670753263042587, + "loss": 2.7419, + "mean_token_accuracy": 0.4502691328525543, + "num_tokens": 8318859058.0, + "step": 16273 + }, + { + "epoch": 4.400757166035695, + "grad_norm": 3.59375, + "learning_rate": 0.002670156190921583, + "loss": 2.9406, + "mean_token_accuracy": 0.4285147786140442, + "num_tokens": 8319383330.0, + "step": 16274 + }, + { + "epoch": 4.401027582477014, + "grad_norm": 4.09375, + "learning_rate": 0.0026695593743846823, + "loss": 2.8688, + "mean_token_accuracy": 0.44518065452575684, + "num_tokens": 8319846142.0, + "step": 16275 + }, + { + "epoch": 4.401297998918334, + "grad_norm": 4.28125, + "learning_rate": 0.0026689628134501982, + "loss": 2.8298, + "mean_token_accuracy": 0.46531766653060913, + "num_tokens": 8320308764.0, + "step": 16276 + }, + { + "epoch": 4.4015684153596535, + "grad_norm": 3.9375, + "learning_rate": 0.002668366508136435, + "loss": 2.7236, + "mean_token_accuracy": 0.4254535138607025, + "num_tokens": 8320831781.0, + "step": 16277 + }, + { + "epoch": 4.401838831800974, + "grad_norm": 3.0, + "learning_rate": 0.0026677704584616903, + "loss": 2.7412, + "mean_token_accuracy": 0.43123793601989746, + "num_tokens": 8321356011.0, + "step": 16278 + }, + { + "epoch": 4.402109248242293, + "grad_norm": 6.0, + "learning_rate": 0.0026671746644442496, + "loss": 2.5646, + "mean_token_accuracy": 0.4721992611885071, + "num_tokens": 8321854001.0, + "step": 16279 + }, + { + "epoch": 4.402379664683613, + "grad_norm": 2.765625, + "learning_rate": 0.0026665791261023964, + "loss": 2.7966, + "mean_token_accuracy": 0.43430572748184204, + "num_tokens": 8322378276.0, + "step": 16280 + }, + { + "epoch": 4.402650081124932, + "grad_norm": 3.515625, + "learning_rate": 0.0026659838434544025, + "loss": 2.8533, + "mean_token_accuracy": 0.4565770626068115, + "num_tokens": 8322902552.0, + "step": 16281 + }, + { + "epoch": 4.402920497566252, + "grad_norm": 3.53125, + "learning_rate": 0.002665388816518532, + "loss": 2.8913, + "mean_token_accuracy": 0.43126609921455383, + "num_tokens": 8323426833.0, + "step": 16282 + }, + { + "epoch": 4.403190914007571, + "grad_norm": 3.34375, + "learning_rate": 0.0026647940453130427, + "loss": 2.8492, + "mean_token_accuracy": 0.42473238706588745, + "num_tokens": 8323951081.0, + "step": 16283 + }, + { + "epoch": 4.403461330448891, + "grad_norm": 3.125, + "learning_rate": 0.002664199529856186, + "loss": 2.715, + "mean_token_accuracy": 0.43216538429260254, + "num_tokens": 8324475208.0, + "step": 16284 + }, + { + "epoch": 4.403731746890211, + "grad_norm": 3.140625, + "learning_rate": 0.0026636052701662005, + "loss": 2.9631, + "mean_token_accuracy": 0.41635605692863464, + "num_tokens": 8324999424.0, + "step": 16285 + }, + { + "epoch": 4.404002163331531, + "grad_norm": 3.296875, + "learning_rate": 0.002663011266261322, + "loss": 2.7199, + "mean_token_accuracy": 0.45517244935035706, + "num_tokens": 8325523678.0, + "step": 16286 + }, + { + "epoch": 4.40427257977285, + "grad_norm": 3.0, + "learning_rate": 0.0026624175181597745, + "loss": 2.7496, + "mean_token_accuracy": 0.4514487087726593, + "num_tokens": 8326047820.0, + "step": 16287 + }, + { + "epoch": 4.40454299621417, + "grad_norm": 3.125, + "learning_rate": 0.0026618240258797776, + "loss": 2.6655, + "mean_token_accuracy": 0.4473225772380829, + "num_tokens": 8326571938.0, + "step": 16288 + }, + { + "epoch": 4.404813412655489, + "grad_norm": 3.296875, + "learning_rate": 0.0026612307894395416, + "loss": 2.8712, + "mean_token_accuracy": 0.4366336464881897, + "num_tokens": 8327096124.0, + "step": 16289 + }, + { + "epoch": 4.405083829096809, + "grad_norm": 3.65625, + "learning_rate": 0.0026606378088572654, + "loss": 2.7183, + "mean_token_accuracy": 0.44304049015045166, + "num_tokens": 8327620392.0, + "step": 16290 + }, + { + "epoch": 4.4053542455381285, + "grad_norm": 70.0, + "learning_rate": 0.002660045084151147, + "loss": 4.6489, + "mean_token_accuracy": 0.2663443088531494, + "num_tokens": 8328128189.0, + "step": 16291 + }, + { + "epoch": 4.405624661979449, + "grad_norm": 7.75, + "learning_rate": 0.0026594526153393716, + "loss": 2.7959, + "mean_token_accuracy": 0.44116443395614624, + "num_tokens": 8328652444.0, + "step": 16292 + }, + { + "epoch": 4.405895078420768, + "grad_norm": 36.0, + "learning_rate": 0.0026588604024401176, + "loss": 2.6237, + "mean_token_accuracy": 0.4711514711380005, + "num_tokens": 8329176594.0, + "step": 16293 + }, + { + "epoch": 4.406165494862088, + "grad_norm": 5.0625, + "learning_rate": 0.002658268445471556, + "loss": 2.8523, + "mean_token_accuracy": 0.4274671673774719, + "num_tokens": 8329678000.0, + "step": 16294 + }, + { + "epoch": 4.406435911303407, + "grad_norm": 3.109375, + "learning_rate": 0.002657676744451851, + "loss": 3.0914, + "mean_token_accuracy": 0.41480159759521484, + "num_tokens": 8330202274.0, + "step": 16295 + }, + { + "epoch": 4.406706327744727, + "grad_norm": 4.21875, + "learning_rate": 0.0026570852993991543, + "loss": 3.001, + "mean_token_accuracy": 0.4163227081298828, + "num_tokens": 8330726507.0, + "step": 16296 + }, + { + "epoch": 4.406976744186046, + "grad_norm": 3.859375, + "learning_rate": 0.0026564941103316164, + "loss": 2.9079, + "mean_token_accuracy": 0.43342798948287964, + "num_tokens": 8331250640.0, + "step": 16297 + }, + { + "epoch": 4.407247160627366, + "grad_norm": 4.03125, + "learning_rate": 0.002655903177267378, + "loss": 2.8646, + "mean_token_accuracy": 0.4273810386657715, + "num_tokens": 8331773343.0, + "step": 16298 + }, + { + "epoch": 4.407517577068686, + "grad_norm": 3.4375, + "learning_rate": 0.002655312500224565, + "loss": 2.7816, + "mean_token_accuracy": 0.46101781725883484, + "num_tokens": 8332271271.0, + "step": 16299 + }, + { + "epoch": 4.407787993510006, + "grad_norm": 3.78125, + "learning_rate": 0.002654722079221308, + "loss": 2.7227, + "mean_token_accuracy": 0.411515474319458, + "num_tokens": 8332795514.0, + "step": 16300 + }, + { + "epoch": 4.408058409951325, + "grad_norm": 3.5, + "learning_rate": 0.002654131914275718, + "loss": 2.7792, + "mean_token_accuracy": 0.43537917733192444, + "num_tokens": 8333319666.0, + "step": 16301 + }, + { + "epoch": 4.408328826392645, + "grad_norm": 4.125, + "learning_rate": 0.0026535420054059045, + "loss": 2.6735, + "mean_token_accuracy": 0.4607703685760498, + "num_tokens": 8333781681.0, + "step": 16302 + }, + { + "epoch": 4.408599242833964, + "grad_norm": 3.78125, + "learning_rate": 0.0026529523526299683, + "loss": 2.7143, + "mean_token_accuracy": 0.44403883814811707, + "num_tokens": 8334305871.0, + "step": 16303 + }, + { + "epoch": 4.408869659275284, + "grad_norm": 3.703125, + "learning_rate": 0.002652362955966, + "loss": 2.8166, + "mean_token_accuracy": 0.4457429349422455, + "num_tokens": 8334830111.0, + "step": 16304 + }, + { + "epoch": 4.4091400757166035, + "grad_norm": 4.15625, + "learning_rate": 0.0026517738154320848, + "loss": 2.8962, + "mean_token_accuracy": 0.4468168616294861, + "num_tokens": 8335298516.0, + "step": 16305 + }, + { + "epoch": 4.409410492157924, + "grad_norm": 4.375, + "learning_rate": 0.002651184931046301, + "loss": 2.6628, + "mean_token_accuracy": 0.4294792115688324, + "num_tokens": 8335822711.0, + "step": 16306 + }, + { + "epoch": 4.409680908599243, + "grad_norm": 5.0625, + "learning_rate": 0.002650596302826713, + "loss": 2.8764, + "mean_token_accuracy": 0.44312843680381775, + "num_tokens": 8336326688.0, + "step": 16307 + }, + { + "epoch": 4.409951325040563, + "grad_norm": 3.40625, + "learning_rate": 0.0026500079307913878, + "loss": 2.7438, + "mean_token_accuracy": 0.4375975430011749, + "num_tokens": 8336850863.0, + "step": 16308 + }, + { + "epoch": 4.410221741481882, + "grad_norm": 5.375, + "learning_rate": 0.0026494198149583743, + "loss": 2.5353, + "mean_token_accuracy": 0.4520975351333618, + "num_tokens": 8337280115.0, + "step": 16309 + }, + { + "epoch": 4.410492157923201, + "grad_norm": 3.4375, + "learning_rate": 0.0026488319553457157, + "loss": 2.8057, + "mean_token_accuracy": 0.4507763981819153, + "num_tokens": 8337804396.0, + "step": 16310 + }, + { + "epoch": 4.410762574364521, + "grad_norm": 39.5, + "learning_rate": 0.0026482443519714543, + "loss": 4.128, + "mean_token_accuracy": 0.33857980370521545, + "num_tokens": 8338328662.0, + "step": 16311 + }, + { + "epoch": 4.411032990805841, + "grad_norm": 11.125, + "learning_rate": 0.0026476570048536156, + "loss": 3.0983, + "mean_token_accuracy": 0.4076753258705139, + "num_tokens": 8338852926.0, + "step": 16312 + }, + { + "epoch": 4.411303407247161, + "grad_norm": 6.21875, + "learning_rate": 0.0026470699140102223, + "loss": 2.7598, + "mean_token_accuracy": 0.4483839273452759, + "num_tokens": 8339322172.0, + "step": 16313 + }, + { + "epoch": 4.41157382368848, + "grad_norm": 2.703125, + "learning_rate": 0.0026464830794592905, + "loss": 2.8065, + "mean_token_accuracy": 0.4414899945259094, + "num_tokens": 8339846074.0, + "step": 16314 + }, + { + "epoch": 4.4118442401298, + "grad_norm": 2.828125, + "learning_rate": 0.0026458965012188243, + "loss": 2.9362, + "mean_token_accuracy": 0.4299905002117157, + "num_tokens": 8340370345.0, + "step": 16315 + }, + { + "epoch": 4.412114656571119, + "grad_norm": 3.84375, + "learning_rate": 0.0026453101793068186, + "loss": 2.7347, + "mean_token_accuracy": 0.44290825724601746, + "num_tokens": 8340894504.0, + "step": 16316 + }, + { + "epoch": 4.412385073012439, + "grad_norm": 4.875, + "learning_rate": 0.002644724113741268, + "loss": 2.9174, + "mean_token_accuracy": 0.43367528915405273, + "num_tokens": 8341365553.0, + "step": 16317 + }, + { + "epoch": 4.412655489453758, + "grad_norm": 3.734375, + "learning_rate": 0.002644138304540153, + "loss": 2.5938, + "mean_token_accuracy": 0.43157583475112915, + "num_tokens": 8341839302.0, + "step": 16318 + }, + { + "epoch": 4.4129259058950785, + "grad_norm": 3.15625, + "learning_rate": 0.0026435527517214463, + "loss": 2.8353, + "mean_token_accuracy": 0.44311660528182983, + "num_tokens": 8342351857.0, + "step": 16319 + }, + { + "epoch": 4.413196322336398, + "grad_norm": 3.515625, + "learning_rate": 0.0026429674553031172, + "loss": 2.6793, + "mean_token_accuracy": 0.47231751680374146, + "num_tokens": 8342847343.0, + "step": 16320 + }, + { + "epoch": 4.413466738777718, + "grad_norm": 4.15625, + "learning_rate": 0.0026423824153031235, + "loss": 2.8359, + "mean_token_accuracy": 0.43955910205841064, + "num_tokens": 8343371605.0, + "step": 16321 + }, + { + "epoch": 4.413737155219037, + "grad_norm": 3.921875, + "learning_rate": 0.0026417976317394134, + "loss": 2.9537, + "mean_token_accuracy": 0.4204835295677185, + "num_tokens": 8343895883.0, + "step": 16322 + }, + { + "epoch": 4.414007571660357, + "grad_norm": 3.765625, + "learning_rate": 0.0026412131046299335, + "loss": 2.716, + "mean_token_accuracy": 0.452515572309494, + "num_tokens": 8344420067.0, + "step": 16323 + }, + { + "epoch": 4.414277988101676, + "grad_norm": 3.234375, + "learning_rate": 0.0026406288339926175, + "loss": 2.8399, + "mean_token_accuracy": 0.4328537583351135, + "num_tokens": 8344944330.0, + "step": 16324 + }, + { + "epoch": 4.414548404542996, + "grad_norm": 3.8125, + "learning_rate": 0.00264004481984539, + "loss": 2.7745, + "mean_token_accuracy": 0.45828497409820557, + "num_tokens": 8345468598.0, + "step": 16325 + }, + { + "epoch": 4.4148188209843156, + "grad_norm": 3.109375, + "learning_rate": 0.0026394610622061726, + "loss": 2.6223, + "mean_token_accuracy": 0.4270598888397217, + "num_tokens": 8345992817.0, + "step": 16326 + }, + { + "epoch": 4.415089237425636, + "grad_norm": 3.578125, + "learning_rate": 0.0026388775610928787, + "loss": 2.9247, + "mean_token_accuracy": 0.4301878809928894, + "num_tokens": 8346517077.0, + "step": 16327 + }, + { + "epoch": 4.415359653866955, + "grad_norm": 4.5, + "learning_rate": 0.002638294316523408, + "loss": 2.7849, + "mean_token_accuracy": 0.4483664631843567, + "num_tokens": 8347033416.0, + "step": 16328 + }, + { + "epoch": 4.415630070308275, + "grad_norm": 4.1875, + "learning_rate": 0.002637711328515659, + "loss": 2.7999, + "mean_token_accuracy": 0.43609994649887085, + "num_tokens": 8347557656.0, + "step": 16329 + }, + { + "epoch": 4.415900486749594, + "grad_norm": 4.09375, + "learning_rate": 0.002637128597087518, + "loss": 2.6775, + "mean_token_accuracy": 0.45103752613067627, + "num_tokens": 8348051646.0, + "step": 16330 + }, + { + "epoch": 4.416170903190914, + "grad_norm": 73.0, + "learning_rate": 0.0026365461222568644, + "loss": 4.3737, + "mean_token_accuracy": 0.36309289932250977, + "num_tokens": 8348575897.0, + "step": 16331 + }, + { + "epoch": 4.416441319632233, + "grad_norm": 8.625, + "learning_rate": 0.002635963904041573, + "loss": 2.6481, + "mean_token_accuracy": 0.44032910466194153, + "num_tokens": 8349100010.0, + "step": 16332 + }, + { + "epoch": 4.4167117360735535, + "grad_norm": 3.1875, + "learning_rate": 0.0026353819424595032, + "loss": 2.9117, + "mean_token_accuracy": 0.43153369426727295, + "num_tokens": 8349593547.0, + "step": 16333 + }, + { + "epoch": 4.416982152514873, + "grad_norm": 11.6875, + "learning_rate": 0.0026348002375285165, + "loss": 2.597, + "mean_token_accuracy": 0.5061352252960205, + "num_tokens": 8349993516.0, + "step": 16334 + }, + { + "epoch": 4.417252568956193, + "grad_norm": 5.5625, + "learning_rate": 0.0026342187892664593, + "loss": 3.0487, + "mean_token_accuracy": 0.42038726806640625, + "num_tokens": 8350517711.0, + "step": 16335 + }, + { + "epoch": 4.417522985397512, + "grad_norm": 2.828125, + "learning_rate": 0.00263363759769117, + "loss": 2.9111, + "mean_token_accuracy": 0.43822336196899414, + "num_tokens": 8351041981.0, + "step": 16336 + }, + { + "epoch": 4.417793401838832, + "grad_norm": 3.1875, + "learning_rate": 0.002633056662820484, + "loss": 2.6453, + "mean_token_accuracy": 0.43329229950904846, + "num_tokens": 8351566111.0, + "step": 16337 + }, + { + "epoch": 4.418063818280151, + "grad_norm": 3.390625, + "learning_rate": 0.0026324759846722257, + "loss": 2.7598, + "mean_token_accuracy": 0.4359590411186218, + "num_tokens": 8352090286.0, + "step": 16338 + }, + { + "epoch": 4.418334234721471, + "grad_norm": 3.796875, + "learning_rate": 0.0026318955632642096, + "loss": 2.943, + "mean_token_accuracy": 0.41653794050216675, + "num_tokens": 8352564902.0, + "step": 16339 + }, + { + "epoch": 4.4186046511627906, + "grad_norm": 3.484375, + "learning_rate": 0.002631315398614247, + "loss": 2.8628, + "mean_token_accuracy": 0.43504369258880615, + "num_tokens": 8353089058.0, + "step": 16340 + }, + { + "epoch": 4.418875067604111, + "grad_norm": 3.484375, + "learning_rate": 0.0026307354907401403, + "loss": 2.9573, + "mean_token_accuracy": 0.43648093938827515, + "num_tokens": 8353613334.0, + "step": 16341 + }, + { + "epoch": 4.41914548404543, + "grad_norm": 4.46875, + "learning_rate": 0.0026301558396596785, + "loss": 2.8023, + "mean_token_accuracy": 0.4410778284072876, + "num_tokens": 8354137606.0, + "step": 16342 + }, + { + "epoch": 4.41941590048675, + "grad_norm": 3.859375, + "learning_rate": 0.0026295764453906525, + "loss": 2.8704, + "mean_token_accuracy": 0.4372227191925049, + "num_tokens": 8354661884.0, + "step": 16343 + }, + { + "epoch": 4.419686316928069, + "grad_norm": 4.5, + "learning_rate": 0.002628997307950837, + "loss": 2.7937, + "mean_token_accuracy": 0.4379690885543823, + "num_tokens": 8355186080.0, + "step": 16344 + }, + { + "epoch": 4.419956733369389, + "grad_norm": 3.796875, + "learning_rate": 0.002628418427357999, + "loss": 2.6644, + "mean_token_accuracy": 0.4524715840816498, + "num_tokens": 8355710332.0, + "step": 16345 + }, + { + "epoch": 4.420227149810708, + "grad_norm": 3.75, + "learning_rate": 0.0026278398036299047, + "loss": 2.7099, + "mean_token_accuracy": 0.44451773166656494, + "num_tokens": 8356234548.0, + "step": 16346 + }, + { + "epoch": 4.4204975662520285, + "grad_norm": 3.921875, + "learning_rate": 0.0026272614367843063, + "loss": 2.7336, + "mean_token_accuracy": 0.447757363319397, + "num_tokens": 8356743215.0, + "step": 16347 + }, + { + "epoch": 4.420767982693348, + "grad_norm": 3.375, + "learning_rate": 0.0026266833268389486, + "loss": 2.7257, + "mean_token_accuracy": 0.44120657444000244, + "num_tokens": 8357267427.0, + "step": 16348 + }, + { + "epoch": 4.421038399134668, + "grad_norm": 3.875, + "learning_rate": 0.002626105473811572, + "loss": 2.7983, + "mean_token_accuracy": 0.4284393787384033, + "num_tokens": 8357791640.0, + "step": 16349 + }, + { + "epoch": 4.421308815575987, + "grad_norm": 3.59375, + "learning_rate": 0.002625527877719905, + "loss": 2.927, + "mean_token_accuracy": 0.44323885440826416, + "num_tokens": 8358288475.0, + "step": 16350 + }, + { + "epoch": 4.421579232017306, + "grad_norm": 24.25, + "learning_rate": 0.0026249505385816697, + "loss": 2.8764, + "mean_token_accuracy": 0.4352581799030304, + "num_tokens": 8358812646.0, + "step": 16351 + }, + { + "epoch": 4.421849648458626, + "grad_norm": 7.96875, + "learning_rate": 0.002624373456414582, + "loss": 2.842, + "mean_token_accuracy": 0.4428056478500366, + "num_tokens": 8359336915.0, + "step": 16352 + }, + { + "epoch": 4.422120064899946, + "grad_norm": 3.0625, + "learning_rate": 0.002623796631236347, + "loss": 2.8238, + "mean_token_accuracy": 0.42523306608200073, + "num_tokens": 8359861091.0, + "step": 16353 + }, + { + "epoch": 4.4223904813412656, + "grad_norm": 2.875, + "learning_rate": 0.0026232200630646645, + "loss": 2.8325, + "mean_token_accuracy": 0.44273442029953003, + "num_tokens": 8360385336.0, + "step": 16354 + }, + { + "epoch": 4.422660897782585, + "grad_norm": 4.0625, + "learning_rate": 0.002622643751917225, + "loss": 2.8924, + "mean_token_accuracy": 0.4221988916397095, + "num_tokens": 8360893408.0, + "step": 16355 + }, + { + "epoch": 4.422931314223905, + "grad_norm": 3.9375, + "learning_rate": 0.002622067697811711, + "loss": 2.9032, + "mean_token_accuracy": 0.4439863860607147, + "num_tokens": 8361405995.0, + "step": 16356 + }, + { + "epoch": 4.423201730665224, + "grad_norm": 3.453125, + "learning_rate": 0.002621491900765799, + "loss": 2.686, + "mean_token_accuracy": 0.4429665803909302, + "num_tokens": 8361930156.0, + "step": 16357 + }, + { + "epoch": 4.423472147106544, + "grad_norm": 3.234375, + "learning_rate": 0.0026209163607971552, + "loss": 2.8391, + "mean_token_accuracy": 0.42957383394241333, + "num_tokens": 8362454176.0, + "step": 16358 + }, + { + "epoch": 4.423742563547863, + "grad_norm": 3.234375, + "learning_rate": 0.0026203410779234372, + "loss": 2.7622, + "mean_token_accuracy": 0.4217398762702942, + "num_tokens": 8362978258.0, + "step": 16359 + }, + { + "epoch": 4.424012979989183, + "grad_norm": 3.828125, + "learning_rate": 0.002619766052162299, + "loss": 2.6967, + "mean_token_accuracy": 0.4731646180152893, + "num_tokens": 8363502523.0, + "step": 16360 + }, + { + "epoch": 4.424283396430503, + "grad_norm": 3.8125, + "learning_rate": 0.002619191283531384, + "loss": 2.6102, + "mean_token_accuracy": 0.4522857666015625, + "num_tokens": 8364026653.0, + "step": 16361 + }, + { + "epoch": 4.424553812871823, + "grad_norm": 3.421875, + "learning_rate": 0.002618616772048324, + "loss": 2.8656, + "mean_token_accuracy": 0.4453248977661133, + "num_tokens": 8364550834.0, + "step": 16362 + }, + { + "epoch": 4.424824229313142, + "grad_norm": 3.828125, + "learning_rate": 0.002618042517730752, + "loss": 2.982, + "mean_token_accuracy": 0.43103253841400146, + "num_tokens": 8365024462.0, + "step": 16363 + }, + { + "epoch": 4.425094645754462, + "grad_norm": 3.765625, + "learning_rate": 0.0026174685205962852, + "loss": 2.779, + "mean_token_accuracy": 0.44454485177993774, + "num_tokens": 8365529137.0, + "step": 16364 + }, + { + "epoch": 4.425365062195781, + "grad_norm": 3.796875, + "learning_rate": 0.0026168947806625336, + "loss": 2.7744, + "mean_token_accuracy": 0.4348997473716736, + "num_tokens": 8366053090.0, + "step": 16365 + }, + { + "epoch": 4.425635478637101, + "grad_norm": 4.28125, + "learning_rate": 0.0026163212979471045, + "loss": 2.6924, + "mean_token_accuracy": 0.4336033761501312, + "num_tokens": 8366577282.0, + "step": 16366 + }, + { + "epoch": 4.4259058950784205, + "grad_norm": 3.421875, + "learning_rate": 0.0026157480724675916, + "loss": 2.6173, + "mean_token_accuracy": 0.44471099972724915, + "num_tokens": 8367101460.0, + "step": 16367 + }, + { + "epoch": 4.426176311519741, + "grad_norm": 3.25, + "learning_rate": 0.0026151751042415867, + "loss": 2.7498, + "mean_token_accuracy": 0.45388713479042053, + "num_tokens": 8367625692.0, + "step": 16368 + }, + { + "epoch": 4.42644672796106, + "grad_norm": 3.0, + "learning_rate": 0.002614602393286666, + "loss": 2.8661, + "mean_token_accuracy": 0.44249239563941956, + "num_tokens": 8368149913.0, + "step": 16369 + }, + { + "epoch": 4.42671714440238, + "grad_norm": 3.46875, + "learning_rate": 0.0026140299396204035, + "loss": 2.5768, + "mean_token_accuracy": 0.4554762840270996, + "num_tokens": 8368673988.0, + "step": 16370 + }, + { + "epoch": 4.426987560843699, + "grad_norm": 28.25, + "learning_rate": 0.002613457743260364, + "loss": 4.0204, + "mean_token_accuracy": 0.35871070623397827, + "num_tokens": 8369198161.0, + "step": 16371 + }, + { + "epoch": 4.427257977285019, + "grad_norm": 6.46875, + "learning_rate": 0.002612885804224106, + "loss": 2.7378, + "mean_token_accuracy": 0.44604402780532837, + "num_tokens": 8369722292.0, + "step": 16372 + }, + { + "epoch": 4.427528393726338, + "grad_norm": 2.734375, + "learning_rate": 0.0026123141225291757, + "loss": 2.6917, + "mean_token_accuracy": 0.4428677260875702, + "num_tokens": 8370246483.0, + "step": 16373 + }, + { + "epoch": 4.427798810167658, + "grad_norm": 4.03125, + "learning_rate": 0.002611742698193113, + "loss": 2.7504, + "mean_token_accuracy": 0.45375317335128784, + "num_tokens": 8370760853.0, + "step": 16374 + }, + { + "epoch": 4.428069226608978, + "grad_norm": 3.890625, + "learning_rate": 0.0026111715312334546, + "loss": 2.7103, + "mean_token_accuracy": 0.44523128867149353, + "num_tokens": 8371285112.0, + "step": 16375 + }, + { + "epoch": 4.428339643050298, + "grad_norm": 3.734375, + "learning_rate": 0.002610600621667723, + "loss": 2.9564, + "mean_token_accuracy": 0.4450111389160156, + "num_tokens": 8371734295.0, + "step": 16376 + }, + { + "epoch": 4.428610059491617, + "grad_norm": 4.09375, + "learning_rate": 0.002610029969513434, + "loss": 2.9082, + "mean_token_accuracy": 0.43532466888427734, + "num_tokens": 8372258573.0, + "step": 16377 + }, + { + "epoch": 4.428880475932937, + "grad_norm": 4.9375, + "learning_rate": 0.0026094595747881023, + "loss": 2.7069, + "mean_token_accuracy": 0.4336894154548645, + "num_tokens": 8372710307.0, + "step": 16378 + }, + { + "epoch": 4.429150892374256, + "grad_norm": 3.109375, + "learning_rate": 0.002608889437509223, + "loss": 2.6532, + "mean_token_accuracy": 0.4611833095550537, + "num_tokens": 8373177963.0, + "step": 16379 + }, + { + "epoch": 4.429421308815576, + "grad_norm": 3.34375, + "learning_rate": 0.0026083195576942935, + "loss": 2.769, + "mean_token_accuracy": 0.43695151805877686, + "num_tokens": 8373702229.0, + "step": 16380 + }, + { + "epoch": 4.4296917252568955, + "grad_norm": 3.90625, + "learning_rate": 0.0026077499353607987, + "loss": 2.8125, + "mean_token_accuracy": 0.4335237741470337, + "num_tokens": 8374226359.0, + "step": 16381 + }, + { + "epoch": 4.429962141698216, + "grad_norm": 3.875, + "learning_rate": 0.0026071805705262154, + "loss": 2.7765, + "mean_token_accuracy": 0.4349059462547302, + "num_tokens": 8374736901.0, + "step": 16382 + }, + { + "epoch": 4.430232558139535, + "grad_norm": 3.859375, + "learning_rate": 0.0026066114632080127, + "loss": 2.8177, + "mean_token_accuracy": 0.44874951243400574, + "num_tokens": 8375261067.0, + "step": 16383 + }, + { + "epoch": 4.430502974580855, + "grad_norm": 4.9375, + "learning_rate": 0.0026060426134236554, + "loss": 2.8689, + "mean_token_accuracy": 0.4475887417793274, + "num_tokens": 8375785303.0, + "step": 16384 + }, + { + "epoch": 4.430773391022174, + "grad_norm": 4.6875, + "learning_rate": 0.002605474021190594, + "loss": 2.6791, + "mean_token_accuracy": 0.44496095180511475, + "num_tokens": 8376309523.0, + "step": 16385 + }, + { + "epoch": 4.431043807463494, + "grad_norm": 5.65625, + "learning_rate": 0.0026049056865262783, + "loss": 2.7901, + "mean_token_accuracy": 0.3994014263153076, + "num_tokens": 8376794911.0, + "step": 16386 + }, + { + "epoch": 4.431314223904813, + "grad_norm": 3.234375, + "learning_rate": 0.0026043376094481443, + "loss": 2.8294, + "mean_token_accuracy": 0.44012561440467834, + "num_tokens": 8377319000.0, + "step": 16387 + }, + { + "epoch": 4.431584640346133, + "grad_norm": 5.0625, + "learning_rate": 0.0026037697899736214, + "loss": 2.8936, + "mean_token_accuracy": 0.42043358087539673, + "num_tokens": 8377843203.0, + "step": 16388 + }, + { + "epoch": 4.431855056787453, + "grad_norm": 3.421875, + "learning_rate": 0.0026032022281201332, + "loss": 2.8842, + "mean_token_accuracy": 0.44179096817970276, + "num_tokens": 8378367383.0, + "step": 16389 + }, + { + "epoch": 4.432125473228773, + "grad_norm": 4.65625, + "learning_rate": 0.002602634923905096, + "loss": 2.8026, + "mean_token_accuracy": 0.42649656534194946, + "num_tokens": 8378891479.0, + "step": 16390 + }, + { + "epoch": 4.432395889670092, + "grad_norm": 24.125, + "learning_rate": 0.002602067877345911, + "loss": 3.3329, + "mean_token_accuracy": 0.4063635468482971, + "num_tokens": 8379347373.0, + "step": 16391 + }, + { + "epoch": 4.432666306111411, + "grad_norm": 8.0, + "learning_rate": 0.0026015010884599832, + "loss": 2.8529, + "mean_token_accuracy": 0.42591071128845215, + "num_tokens": 8379871449.0, + "step": 16392 + }, + { + "epoch": 4.432936722552731, + "grad_norm": 2.953125, + "learning_rate": 0.0026009345572646997, + "loss": 2.6075, + "mean_token_accuracy": 0.44072020053863525, + "num_tokens": 8380395570.0, + "step": 16393 + }, + { + "epoch": 4.433207138994051, + "grad_norm": 3.78125, + "learning_rate": 0.002600368283777442, + "loss": 2.9645, + "mean_token_accuracy": 0.42910459637641907, + "num_tokens": 8380919786.0, + "step": 16394 + }, + { + "epoch": 4.4334775554353705, + "grad_norm": 3.4375, + "learning_rate": 0.0025998022680155893, + "loss": 2.9677, + "mean_token_accuracy": 0.4254041314125061, + "num_tokens": 8381427674.0, + "step": 16395 + }, + { + "epoch": 4.43374797187669, + "grad_norm": 4.21875, + "learning_rate": 0.0025992365099965047, + "loss": 2.6195, + "mean_token_accuracy": 0.4500430226325989, + "num_tokens": 8381951874.0, + "step": 16396 + }, + { + "epoch": 4.43401838831801, + "grad_norm": 2.828125, + "learning_rate": 0.002598671009737551, + "loss": 2.7644, + "mean_token_accuracy": 0.4588398337364197, + "num_tokens": 8382476023.0, + "step": 16397 + }, + { + "epoch": 4.434288804759329, + "grad_norm": 3.421875, + "learning_rate": 0.0025981057672560754, + "loss": 2.7911, + "mean_token_accuracy": 0.43700337409973145, + "num_tokens": 8383000115.0, + "step": 16398 + }, + { + "epoch": 4.434559221200649, + "grad_norm": 4.6875, + "learning_rate": 0.0025975407825694244, + "loss": 2.6193, + "mean_token_accuracy": 0.48162752389907837, + "num_tokens": 8383492659.0, + "step": 16399 + }, + { + "epoch": 4.434829637641968, + "grad_norm": 2.921875, + "learning_rate": 0.002596976055694931, + "loss": 2.7683, + "mean_token_accuracy": 0.4551565647125244, + "num_tokens": 8384016936.0, + "step": 16400 + }, + { + "epoch": 4.435100054083288, + "grad_norm": 3.578125, + "learning_rate": 0.002596411586649926, + "loss": 2.855, + "mean_token_accuracy": 0.44687584042549133, + "num_tokens": 8384541124.0, + "step": 16401 + }, + { + "epoch": 4.4353704705246075, + "grad_norm": 3.53125, + "learning_rate": 0.002595847375451724, + "loss": 2.7626, + "mean_token_accuracy": 0.4483173191547394, + "num_tokens": 8385020662.0, + "step": 16402 + }, + { + "epoch": 4.435640886965928, + "grad_norm": 3.78125, + "learning_rate": 0.0025952834221176424, + "loss": 2.8667, + "mean_token_accuracy": 0.4366716742515564, + "num_tokens": 8385544851.0, + "step": 16403 + }, + { + "epoch": 4.435911303407247, + "grad_norm": 3.96875, + "learning_rate": 0.0025947197266649818, + "loss": 2.7234, + "mean_token_accuracy": 0.43712836503982544, + "num_tokens": 8386069093.0, + "step": 16404 + }, + { + "epoch": 4.436181719848567, + "grad_norm": 3.15625, + "learning_rate": 0.0025941562891110366, + "loss": 2.9929, + "mean_token_accuracy": 0.39639371633529663, + "num_tokens": 8386593337.0, + "step": 16405 + }, + { + "epoch": 4.436452136289886, + "grad_norm": 3.890625, + "learning_rate": 0.0025935931094730968, + "loss": 2.9081, + "mean_token_accuracy": 0.4425649642944336, + "num_tokens": 8387117448.0, + "step": 16406 + }, + { + "epoch": 4.436722552731206, + "grad_norm": 4.28125, + "learning_rate": 0.0025930301877684432, + "loss": 2.5864, + "mean_token_accuracy": 0.44885408878326416, + "num_tokens": 8387604681.0, + "step": 16407 + }, + { + "epoch": 4.436992969172525, + "grad_norm": 8.75, + "learning_rate": 0.002592467524014345, + "loss": 2.6887, + "mean_token_accuracy": 0.44966310262680054, + "num_tokens": 8388128925.0, + "step": 16408 + }, + { + "epoch": 4.4372633856138455, + "grad_norm": 2.578125, + "learning_rate": 0.00259190511822807, + "loss": 2.9362, + "mean_token_accuracy": 0.4336605370044708, + "num_tokens": 8388631480.0, + "step": 16409 + }, + { + "epoch": 4.437533802055165, + "grad_norm": 3.59375, + "learning_rate": 0.0025913429704268708, + "loss": 2.8012, + "mean_token_accuracy": 0.4508505165576935, + "num_tokens": 8389070182.0, + "step": 16410 + }, + { + "epoch": 4.437804218496485, + "grad_norm": 28.25, + "learning_rate": 0.002590781080628, + "loss": 2.8395, + "mean_token_accuracy": 0.4452752470970154, + "num_tokens": 8389543940.0, + "step": 16411 + }, + { + "epoch": 4.438074634937804, + "grad_norm": 7.1875, + "learning_rate": 0.0025902194488486925, + "loss": 2.745, + "mean_token_accuracy": 0.4478169083595276, + "num_tokens": 8390068084.0, + "step": 16412 + }, + { + "epoch": 4.438345051379124, + "grad_norm": 3.515625, + "learning_rate": 0.002589658075106186, + "loss": 2.6869, + "mean_token_accuracy": 0.44584426283836365, + "num_tokens": 8390592191.0, + "step": 16413 + }, + { + "epoch": 4.438615467820443, + "grad_norm": 5.125, + "learning_rate": 0.0025890969594177015, + "loss": 2.5891, + "mean_token_accuracy": 0.46146273612976074, + "num_tokens": 8391116366.0, + "step": 16414 + }, + { + "epoch": 4.438885884261763, + "grad_norm": 3.671875, + "learning_rate": 0.0025885361018004576, + "loss": 2.8687, + "mean_token_accuracy": 0.4095146954059601, + "num_tokens": 8391640592.0, + "step": 16415 + }, + { + "epoch": 4.4391563007030825, + "grad_norm": 4.09375, + "learning_rate": 0.0025879755022716645, + "loss": 2.7845, + "mean_token_accuracy": 0.43700194358825684, + "num_tokens": 8392164832.0, + "step": 16416 + }, + { + "epoch": 4.439426717144403, + "grad_norm": 3.984375, + "learning_rate": 0.002587415160848518, + "loss": 2.7474, + "mean_token_accuracy": 0.4522416293621063, + "num_tokens": 8392688942.0, + "step": 16417 + }, + { + "epoch": 4.439697133585722, + "grad_norm": 4.1875, + "learning_rate": 0.002586855077548217, + "loss": 2.6109, + "mean_token_accuracy": 0.45524823665618896, + "num_tokens": 8393154745.0, + "step": 16418 + }, + { + "epoch": 4.439967550027042, + "grad_norm": 2.90625, + "learning_rate": 0.0025862952523879428, + "loss": 2.8604, + "mean_token_accuracy": 0.442931592464447, + "num_tokens": 8393637968.0, + "step": 16419 + }, + { + "epoch": 4.440237966468361, + "grad_norm": 2.9375, + "learning_rate": 0.0025857356853848713, + "loss": 2.5439, + "mean_token_accuracy": 0.4533114433288574, + "num_tokens": 8394162217.0, + "step": 16420 + }, + { + "epoch": 4.440508382909681, + "grad_norm": 3.296875, + "learning_rate": 0.002585176376556177, + "loss": 2.7448, + "mean_token_accuracy": 0.44787898659706116, + "num_tokens": 8394686442.0, + "step": 16421 + }, + { + "epoch": 4.440778799351, + "grad_norm": 3.765625, + "learning_rate": 0.0025846173259190166, + "loss": 2.9689, + "mean_token_accuracy": 0.4359835386276245, + "num_tokens": 8395185940.0, + "step": 16422 + }, + { + "epoch": 4.4410492157923205, + "grad_norm": 4.90625, + "learning_rate": 0.0025840585334905427, + "loss": 2.7231, + "mean_token_accuracy": 0.43528592586517334, + "num_tokens": 8395710173.0, + "step": 16423 + }, + { + "epoch": 4.44131963223364, + "grad_norm": 3.125, + "learning_rate": 0.0025834999992879054, + "loss": 2.6212, + "mean_token_accuracy": 0.4501786231994629, + "num_tokens": 8396234389.0, + "step": 16424 + }, + { + "epoch": 4.44159004867496, + "grad_norm": 3.375, + "learning_rate": 0.002582941723328237, + "loss": 2.8094, + "mean_token_accuracy": 0.450847327709198, + "num_tokens": 8396748191.0, + "step": 16425 + }, + { + "epoch": 4.441860465116279, + "grad_norm": 3.296875, + "learning_rate": 0.0025823837056286702, + "loss": 2.8237, + "mean_token_accuracy": 0.4437583088874817, + "num_tokens": 8397237084.0, + "step": 16426 + }, + { + "epoch": 4.442130881557599, + "grad_norm": 3.78125, + "learning_rate": 0.002581825946206327, + "loss": 2.5984, + "mean_token_accuracy": 0.4439130425453186, + "num_tokens": 8397761194.0, + "step": 16427 + }, + { + "epoch": 4.442401297998918, + "grad_norm": 3.296875, + "learning_rate": 0.0025812684450783187, + "loss": 2.8913, + "mean_token_accuracy": 0.4426555633544922, + "num_tokens": 8398285384.0, + "step": 16428 + }, + { + "epoch": 4.442671714440238, + "grad_norm": 5.03125, + "learning_rate": 0.0025807112022617536, + "loss": 2.6761, + "mean_token_accuracy": 0.4493188261985779, + "num_tokens": 8398809538.0, + "step": 16429 + }, + { + "epoch": 4.4429421308815575, + "grad_norm": 3.875, + "learning_rate": 0.002580154217773728, + "loss": 2.7264, + "mean_token_accuracy": 0.44988495111465454, + "num_tokens": 8399333658.0, + "step": 16430 + }, + { + "epoch": 4.443212547322878, + "grad_norm": 24.625, + "learning_rate": 0.0025795974916313314, + "loss": 3.0294, + "mean_token_accuracy": 0.448788583278656, + "num_tokens": 8399857935.0, + "step": 16431 + }, + { + "epoch": 4.443482963764197, + "grad_norm": 9.5, + "learning_rate": 0.0025790410238516483, + "loss": 2.9239, + "mean_token_accuracy": 0.42899078130722046, + "num_tokens": 8400333538.0, + "step": 16432 + }, + { + "epoch": 4.443753380205516, + "grad_norm": 2.75, + "learning_rate": 0.00257848481445175, + "loss": 2.912, + "mean_token_accuracy": 0.40120750665664673, + "num_tokens": 8400857581.0, + "step": 16433 + }, + { + "epoch": 4.444023796646836, + "grad_norm": 3.34375, + "learning_rate": 0.002577928863448703, + "loss": 2.6946, + "mean_token_accuracy": 0.45898064970970154, + "num_tokens": 8401325280.0, + "step": 16434 + }, + { + "epoch": 4.444294213088156, + "grad_norm": 2.625, + "learning_rate": 0.002577373170859568, + "loss": 2.8032, + "mean_token_accuracy": 0.4309353232383728, + "num_tokens": 8401849479.0, + "step": 16435 + }, + { + "epoch": 4.444564629529475, + "grad_norm": 3.578125, + "learning_rate": 0.0025768177367013916, + "loss": 2.78, + "mean_token_accuracy": 0.4334942698478699, + "num_tokens": 8402373737.0, + "step": 16436 + }, + { + "epoch": 4.444835045970795, + "grad_norm": 3.640625, + "learning_rate": 0.0025762625609912176, + "loss": 2.9269, + "mean_token_accuracy": 0.4217159152030945, + "num_tokens": 8402897925.0, + "step": 16437 + }, + { + "epoch": 4.445105462412115, + "grad_norm": 3.65625, + "learning_rate": 0.002575707643746083, + "loss": 2.6813, + "mean_token_accuracy": 0.4453771710395813, + "num_tokens": 8403422118.0, + "step": 16438 + }, + { + "epoch": 4.445375878853434, + "grad_norm": 3.84375, + "learning_rate": 0.0025751529849830095, + "loss": 2.6106, + "mean_token_accuracy": 0.4535347521305084, + "num_tokens": 8403946287.0, + "step": 16439 + }, + { + "epoch": 4.445646295294754, + "grad_norm": 3.4375, + "learning_rate": 0.002574598584719019, + "loss": 2.5527, + "mean_token_accuracy": 0.46039527654647827, + "num_tokens": 8404410442.0, + "step": 16440 + }, + { + "epoch": 4.445916711736073, + "grad_norm": 3.46875, + "learning_rate": 0.002574044442971123, + "loss": 2.8359, + "mean_token_accuracy": 0.441039502620697, + "num_tokens": 8404934592.0, + "step": 16441 + }, + { + "epoch": 4.446187128177393, + "grad_norm": 4.125, + "learning_rate": 0.0025734905597563223, + "loss": 2.6431, + "mean_token_accuracy": 0.43671637773513794, + "num_tokens": 8405458727.0, + "step": 16442 + }, + { + "epoch": 4.4464575446187125, + "grad_norm": 3.609375, + "learning_rate": 0.0025729369350916105, + "loss": 3.0174, + "mean_token_accuracy": 0.4364057779312134, + "num_tokens": 8405959897.0, + "step": 16443 + }, + { + "epoch": 4.4467279610600325, + "grad_norm": 4.40625, + "learning_rate": 0.0025723835689939775, + "loss": 2.9484, + "mean_token_accuracy": 0.44936996698379517, + "num_tokens": 8406447038.0, + "step": 16444 + }, + { + "epoch": 4.446998377501352, + "grad_norm": 3.859375, + "learning_rate": 0.0025718304614804, + "loss": 2.7025, + "mean_token_accuracy": 0.4371190071105957, + "num_tokens": 8406971148.0, + "step": 16445 + }, + { + "epoch": 4.447268793942672, + "grad_norm": 3.453125, + "learning_rate": 0.0025712776125678487, + "loss": 2.6588, + "mean_token_accuracy": 0.44176894426345825, + "num_tokens": 8407495415.0, + "step": 16446 + }, + { + "epoch": 4.447539210383991, + "grad_norm": 11.5, + "learning_rate": 0.0025707250222732894, + "loss": 2.6887, + "mean_token_accuracy": 0.47438251972198486, + "num_tokens": 8408019681.0, + "step": 16447 + }, + { + "epoch": 4.447809626825311, + "grad_norm": 3.59375, + "learning_rate": 0.0025701726906136734, + "loss": 2.6644, + "mean_token_accuracy": 0.4482729136943817, + "num_tokens": 8408543877.0, + "step": 16448 + }, + { + "epoch": 4.44808004326663, + "grad_norm": 23.625, + "learning_rate": 0.00256962061760595, + "loss": 2.5576, + "mean_token_accuracy": 0.4996035695075989, + "num_tokens": 8409068151.0, + "step": 16449 + }, + { + "epoch": 4.44835045970795, + "grad_norm": 5.4375, + "learning_rate": 0.0025690688032670595, + "loss": 2.6266, + "mean_token_accuracy": 0.4946328401565552, + "num_tokens": 8409592344.0, + "step": 16450 + }, + { + "epoch": 4.44862087614927, + "grad_norm": 41.75, + "learning_rate": 0.002568517247613931, + "loss": 3.8201, + "mean_token_accuracy": 0.3766576051712036, + "num_tokens": 8410002823.0, + "step": 16451 + }, + { + "epoch": 4.44889129259059, + "grad_norm": 5.5, + "learning_rate": 0.002567965950663489, + "loss": 2.8755, + "mean_token_accuracy": 0.4262886643409729, + "num_tokens": 8410496031.0, + "step": 16452 + }, + { + "epoch": 4.449161709031909, + "grad_norm": 2.921875, + "learning_rate": 0.0025674149124326477, + "loss": 2.8833, + "mean_token_accuracy": 0.41267502307891846, + "num_tokens": 8411020306.0, + "step": 16453 + }, + { + "epoch": 4.449432125473229, + "grad_norm": 3.078125, + "learning_rate": 0.0025668641329383173, + "loss": 2.9106, + "mean_token_accuracy": 0.44277098774909973, + "num_tokens": 8411496703.0, + "step": 16454 + }, + { + "epoch": 4.449702541914548, + "grad_norm": 3.59375, + "learning_rate": 0.0025663136121973937, + "loss": 2.7828, + "mean_token_accuracy": 0.43460988998413086, + "num_tokens": 8412020907.0, + "step": 16455 + }, + { + "epoch": 4.449972958355868, + "grad_norm": 3.46875, + "learning_rate": 0.002565763350226771, + "loss": 2.7345, + "mean_token_accuracy": 0.44832947850227356, + "num_tokens": 8412539036.0, + "step": 16456 + }, + { + "epoch": 4.4502433747971875, + "grad_norm": 3.140625, + "learning_rate": 0.0025652133470433314, + "loss": 2.7086, + "mean_token_accuracy": 0.4554256200790405, + "num_tokens": 8413063199.0, + "step": 16457 + }, + { + "epoch": 4.4505137912385075, + "grad_norm": 3.765625, + "learning_rate": 0.0025646636026639535, + "loss": 2.8394, + "mean_token_accuracy": 0.4284290671348572, + "num_tokens": 8413546720.0, + "step": 16458 + }, + { + "epoch": 4.450784207679827, + "grad_norm": 3.0625, + "learning_rate": 0.002564114117105502, + "loss": 2.8352, + "mean_token_accuracy": 0.4812133312225342, + "num_tokens": 8413960079.0, + "step": 16459 + }, + { + "epoch": 4.451054624121147, + "grad_norm": 4.0625, + "learning_rate": 0.002563564890384836, + "loss": 2.8218, + "mean_token_accuracy": 0.4435797929763794, + "num_tokens": 8414457208.0, + "step": 16460 + }, + { + "epoch": 4.451325040562466, + "grad_norm": 3.84375, + "learning_rate": 0.0025630159225188123, + "loss": 2.7657, + "mean_token_accuracy": 0.4503435790538788, + "num_tokens": 8414981434.0, + "step": 16461 + }, + { + "epoch": 4.451595457003786, + "grad_norm": 3.328125, + "learning_rate": 0.00256246721352427, + "loss": 2.5701, + "mean_token_accuracy": 0.4399433135986328, + "num_tokens": 8415494174.0, + "step": 16462 + }, + { + "epoch": 4.451865873445105, + "grad_norm": 4.5, + "learning_rate": 0.0025619187634180456, + "loss": 2.7395, + "mean_token_accuracy": 0.4299362897872925, + "num_tokens": 8416018300.0, + "step": 16463 + }, + { + "epoch": 4.452136289886425, + "grad_norm": 5.21875, + "learning_rate": 0.0025613705722169707, + "loss": 2.6975, + "mean_token_accuracy": 0.4520718455314636, + "num_tokens": 8416542574.0, + "step": 16464 + }, + { + "epoch": 4.452406706327745, + "grad_norm": 5.53125, + "learning_rate": 0.0025608226399378626, + "loss": 2.8567, + "mean_token_accuracy": 0.4485960006713867, + "num_tokens": 8416999121.0, + "step": 16465 + }, + { + "epoch": 4.452677122769065, + "grad_norm": 4.84375, + "learning_rate": 0.0025602749665975316, + "loss": 2.7815, + "mean_token_accuracy": 0.4768678843975067, + "num_tokens": 8417465947.0, + "step": 16466 + }, + { + "epoch": 4.452947539210384, + "grad_norm": 3.90625, + "learning_rate": 0.0025597275522127863, + "loss": 2.715, + "mean_token_accuracy": 0.45643240213394165, + "num_tokens": 8417990191.0, + "step": 16467 + }, + { + "epoch": 4.453217955651704, + "grad_norm": 3.8125, + "learning_rate": 0.00255918039680042, + "loss": 2.7143, + "mean_token_accuracy": 0.45372241735458374, + "num_tokens": 8418475015.0, + "step": 16468 + }, + { + "epoch": 4.453488372093023, + "grad_norm": 4.0625, + "learning_rate": 0.002558633500377221, + "loss": 2.5973, + "mean_token_accuracy": 0.46374812722206116, + "num_tokens": 8418999053.0, + "step": 16469 + }, + { + "epoch": 4.453758788534343, + "grad_norm": 3.421875, + "learning_rate": 0.0025580868629599724, + "loss": 2.6891, + "mean_token_accuracy": 0.4586786925792694, + "num_tokens": 8419523210.0, + "step": 16470 + }, + { + "epoch": 4.4540292049756625, + "grad_norm": 296.0, + "learning_rate": 0.0025575404845654434, + "loss": 6.1584, + "mean_token_accuracy": 0.12649290263652802, + "num_tokens": 8420047491.0, + "step": 16471 + }, + { + "epoch": 4.4542996214169825, + "grad_norm": 7.65625, + "learning_rate": 0.0025569943652104022, + "loss": 2.8898, + "mean_token_accuracy": 0.4367654025554657, + "num_tokens": 8420571775.0, + "step": 16472 + }, + { + "epoch": 4.454570037858302, + "grad_norm": 2.453125, + "learning_rate": 0.0025564485049116028, + "loss": 2.6694, + "mean_token_accuracy": 0.450348436832428, + "num_tokens": 8421095899.0, + "step": 16473 + }, + { + "epoch": 4.454840454299622, + "grad_norm": 3.578125, + "learning_rate": 0.0025559029036857916, + "loss": 2.5826, + "mean_token_accuracy": 0.44912418723106384, + "num_tokens": 8421579185.0, + "step": 16474 + }, + { + "epoch": 4.455110870740941, + "grad_norm": 2.8125, + "learning_rate": 0.002555357561549715, + "loss": 2.7658, + "mean_token_accuracy": 0.4375578463077545, + "num_tokens": 8422103216.0, + "step": 16475 + }, + { + "epoch": 4.455381287182261, + "grad_norm": 3.125, + "learning_rate": 0.002554812478520101, + "loss": 2.8211, + "mean_token_accuracy": 0.4383431077003479, + "num_tokens": 8422627410.0, + "step": 16476 + }, + { + "epoch": 4.45565170362358, + "grad_norm": 3.546875, + "learning_rate": 0.002554267654613675, + "loss": 2.641, + "mean_token_accuracy": 0.46536141633987427, + "num_tokens": 8423126680.0, + "step": 16477 + }, + { + "epoch": 4.4559221200648995, + "grad_norm": 3.78125, + "learning_rate": 0.0025537230898471557, + "loss": 2.8379, + "mean_token_accuracy": 0.44769734144210815, + "num_tokens": 8423650893.0, + "step": 16478 + }, + { + "epoch": 4.45619253650622, + "grad_norm": 3.8125, + "learning_rate": 0.0025531787842372517, + "loss": 2.8387, + "mean_token_accuracy": 0.43955671787261963, + "num_tokens": 8424175163.0, + "step": 16479 + }, + { + "epoch": 4.456462952947539, + "grad_norm": 4.25, + "learning_rate": 0.0025526347378006615, + "loss": 2.7579, + "mean_token_accuracy": 0.4351159334182739, + "num_tokens": 8424699346.0, + "step": 16480 + }, + { + "epoch": 4.456733369388859, + "grad_norm": 6.6875, + "learning_rate": 0.0025520909505540802, + "loss": 2.7357, + "mean_token_accuracy": 0.4515797793865204, + "num_tokens": 8425215766.0, + "step": 16481 + }, + { + "epoch": 4.457003785830178, + "grad_norm": 2.75, + "learning_rate": 0.0025515474225141915, + "loss": 2.5669, + "mean_token_accuracy": 0.4607924222946167, + "num_tokens": 8425739933.0, + "step": 16482 + }, + { + "epoch": 4.457274202271498, + "grad_norm": 4.4375, + "learning_rate": 0.0025510041536976726, + "loss": 2.7188, + "mean_token_accuracy": 0.44683849811553955, + "num_tokens": 8426264217.0, + "step": 16483 + }, + { + "epoch": 4.457544618712817, + "grad_norm": 3.671875, + "learning_rate": 0.002550461144121195, + "loss": 2.7487, + "mean_token_accuracy": 0.4365094304084778, + "num_tokens": 8426788407.0, + "step": 16484 + }, + { + "epoch": 4.4578150351541375, + "grad_norm": 3.984375, + "learning_rate": 0.0025499183938014177, + "loss": 2.8659, + "mean_token_accuracy": 0.4424070715904236, + "num_tokens": 8427261262.0, + "step": 16485 + }, + { + "epoch": 4.458085451595457, + "grad_norm": 4.40625, + "learning_rate": 0.0025493759027549927, + "loss": 2.5112, + "mean_token_accuracy": 0.4949107766151428, + "num_tokens": 8427785547.0, + "step": 16486 + }, + { + "epoch": 4.458355868036777, + "grad_norm": 3.296875, + "learning_rate": 0.0025488336709985687, + "loss": 2.6557, + "mean_token_accuracy": 0.4310852885246277, + "num_tokens": 8428309735.0, + "step": 16487 + }, + { + "epoch": 4.458626284478096, + "grad_norm": 3.75, + "learning_rate": 0.0025482916985487807, + "loss": 2.8347, + "mean_token_accuracy": 0.43073180317878723, + "num_tokens": 8428833973.0, + "step": 16488 + }, + { + "epoch": 4.458896700919416, + "grad_norm": 4.03125, + "learning_rate": 0.002547749985422256, + "loss": 2.5464, + "mean_token_accuracy": 0.44588619470596313, + "num_tokens": 8429358033.0, + "step": 16489 + }, + { + "epoch": 4.459167117360735, + "grad_norm": 3.734375, + "learning_rate": 0.002547208531635621, + "loss": 2.7881, + "mean_token_accuracy": 0.42938798666000366, + "num_tokens": 8429882226.0, + "step": 16490 + }, + { + "epoch": 4.459437533802055, + "grad_norm": 25.5, + "learning_rate": 0.0025466673372054856, + "loss": 2.7674, + "mean_token_accuracy": 0.39977210760116577, + "num_tokens": 8430330075.0, + "step": 16491 + }, + { + "epoch": 4.4597079502433745, + "grad_norm": 7.96875, + "learning_rate": 0.0025461264021484544, + "loss": 2.8242, + "mean_token_accuracy": 0.42808595299720764, + "num_tokens": 8430854322.0, + "step": 16492 + }, + { + "epoch": 4.459978366684695, + "grad_norm": 3.1875, + "learning_rate": 0.0025455857264811282, + "loss": 2.7701, + "mean_token_accuracy": 0.4333919882774353, + "num_tokens": 8431335870.0, + "step": 16493 + }, + { + "epoch": 4.460248783126014, + "grad_norm": 4.53125, + "learning_rate": 0.0025450453102200932, + "loss": 2.7031, + "mean_token_accuracy": 0.440060019493103, + "num_tokens": 8431860001.0, + "step": 16494 + }, + { + "epoch": 4.460519199567334, + "grad_norm": 3.234375, + "learning_rate": 0.0025445051533819337, + "loss": 2.5974, + "mean_token_accuracy": 0.4569343030452728, + "num_tokens": 8432357414.0, + "step": 16495 + }, + { + "epoch": 4.460789616008653, + "grad_norm": 3.3125, + "learning_rate": 0.0025439652559832203, + "loss": 2.8751, + "mean_token_accuracy": 0.4421747028827667, + "num_tokens": 8432881687.0, + "step": 16496 + }, + { + "epoch": 4.461060032449973, + "grad_norm": 4.15625, + "learning_rate": 0.002543425618040523, + "loss": 2.8684, + "mean_token_accuracy": 0.40991008281707764, + "num_tokens": 8433387475.0, + "step": 16497 + }, + { + "epoch": 4.461330448891292, + "grad_norm": 4.34375, + "learning_rate": 0.0025428862395703937, + "loss": 2.7282, + "mean_token_accuracy": 0.453655868768692, + "num_tokens": 8433872099.0, + "step": 16498 + }, + { + "epoch": 4.4616008653326125, + "grad_norm": 3.5625, + "learning_rate": 0.0025423471205893882, + "loss": 2.817, + "mean_token_accuracy": 0.4507431089878082, + "num_tokens": 8434396360.0, + "step": 16499 + }, + { + "epoch": 4.461871281773932, + "grad_norm": 4.125, + "learning_rate": 0.002541808261114043, + "loss": 2.7874, + "mean_token_accuracy": 0.4363147020339966, + "num_tokens": 8434909668.0, + "step": 16500 + }, + { + "epoch": 4.462141698215252, + "grad_norm": 2.953125, + "learning_rate": 0.002541269661160897, + "loss": 2.815, + "mean_token_accuracy": 0.4418941140174866, + "num_tokens": 8435431877.0, + "step": 16501 + }, + { + "epoch": 4.462412114656571, + "grad_norm": 4.125, + "learning_rate": 0.002540731320746473, + "loss": 2.7516, + "mean_token_accuracy": 0.43218767642974854, + "num_tokens": 8435945381.0, + "step": 16502 + }, + { + "epoch": 4.462682531097891, + "grad_norm": 3.890625, + "learning_rate": 0.0025401932398872863, + "loss": 2.6927, + "mean_token_accuracy": 0.4635586738586426, + "num_tokens": 8436428876.0, + "step": 16503 + }, + { + "epoch": 4.46295294753921, + "grad_norm": 4.21875, + "learning_rate": 0.0025396554185998523, + "loss": 3.0172, + "mean_token_accuracy": 0.42694124579429626, + "num_tokens": 8436941728.0, + "step": 16504 + }, + { + "epoch": 4.46322336398053, + "grad_norm": 4.15625, + "learning_rate": 0.0025391178569006702, + "loss": 2.8061, + "mean_token_accuracy": 0.45126640796661377, + "num_tokens": 8437408387.0, + "step": 16505 + }, + { + "epoch": 4.4634937804218495, + "grad_norm": 4.46875, + "learning_rate": 0.0025385805548062326, + "loss": 2.6728, + "mean_token_accuracy": 0.4399447739124298, + "num_tokens": 8437932626.0, + "step": 16506 + }, + { + "epoch": 4.46376419686317, + "grad_norm": 3.484375, + "learning_rate": 0.0025380435123330282, + "loss": 2.9381, + "mean_token_accuracy": 0.4244117736816406, + "num_tokens": 8438456909.0, + "step": 16507 + }, + { + "epoch": 4.464034613304489, + "grad_norm": 4.4375, + "learning_rate": 0.0025375067294975333, + "loss": 2.6648, + "mean_token_accuracy": 0.4554251432418823, + "num_tokens": 8438981123.0, + "step": 16508 + }, + { + "epoch": 4.464305029745809, + "grad_norm": 3.8125, + "learning_rate": 0.002536970206316219, + "loss": 2.6226, + "mean_token_accuracy": 0.4672815799713135, + "num_tokens": 8439505349.0, + "step": 16509 + }, + { + "epoch": 4.464575446187128, + "grad_norm": 3.84375, + "learning_rate": 0.0025364339428055457, + "loss": 2.5444, + "mean_token_accuracy": 0.4549027383327484, + "num_tokens": 8440029607.0, + "step": 16510 + }, + { + "epoch": 4.464845862628448, + "grad_norm": 65.0, + "learning_rate": 0.00253589793898197, + "loss": 3.5169, + "mean_token_accuracy": 0.3646414875984192, + "num_tokens": 8440553846.0, + "step": 16511 + }, + { + "epoch": 4.465116279069767, + "grad_norm": 9.375, + "learning_rate": 0.0025353621948619354, + "loss": 2.7563, + "mean_token_accuracy": 0.4236043691635132, + "num_tokens": 8441078042.0, + "step": 16512 + }, + { + "epoch": 4.4653866955110875, + "grad_norm": 3.09375, + "learning_rate": 0.002534826710461883, + "loss": 2.7983, + "mean_token_accuracy": 0.43636369705200195, + "num_tokens": 8441602232.0, + "step": 16513 + }, + { + "epoch": 4.465657111952407, + "grad_norm": 5.5, + "learning_rate": 0.002534291485798242, + "loss": 2.751, + "mean_token_accuracy": 0.4742598533630371, + "num_tokens": 8442126512.0, + "step": 16514 + }, + { + "epoch": 4.465927528393727, + "grad_norm": 4.625, + "learning_rate": 0.002533756520887432, + "loss": 2.6706, + "mean_token_accuracy": 0.49974745512008667, + "num_tokens": 8442650758.0, + "step": 16515 + }, + { + "epoch": 4.466197944835046, + "grad_norm": 2.796875, + "learning_rate": 0.002533221815745872, + "loss": 2.8726, + "mean_token_accuracy": 0.44479289650917053, + "num_tokens": 8443174972.0, + "step": 16516 + }, + { + "epoch": 4.466468361276366, + "grad_norm": 4.0, + "learning_rate": 0.002532687370389964, + "loss": 3.1034, + "mean_token_accuracy": 0.4054664969444275, + "num_tokens": 8443640156.0, + "step": 16517 + }, + { + "epoch": 4.466738777717685, + "grad_norm": 3.78125, + "learning_rate": 0.002532153184836109, + "loss": 2.6607, + "mean_token_accuracy": 0.4509789049625397, + "num_tokens": 8444164335.0, + "step": 16518 + }, + { + "epoch": 4.467009194159004, + "grad_norm": 3.65625, + "learning_rate": 0.0025316192591006976, + "loss": 2.6878, + "mean_token_accuracy": 0.4575078785419464, + "num_tokens": 8444655990.0, + "step": 16519 + }, + { + "epoch": 4.4672796106003245, + "grad_norm": 3.65625, + "learning_rate": 0.002531085593200109, + "loss": 2.986, + "mean_token_accuracy": 0.4480419158935547, + "num_tokens": 8445097624.0, + "step": 16520 + }, + { + "epoch": 4.467550027041644, + "grad_norm": 3.359375, + "learning_rate": 0.002530552187150722, + "loss": 2.7292, + "mean_token_accuracy": 0.42724454402923584, + "num_tokens": 8445621860.0, + "step": 16521 + }, + { + "epoch": 4.467820443482964, + "grad_norm": 3.328125, + "learning_rate": 0.0025300190409689007, + "loss": 2.9233, + "mean_token_accuracy": 0.4360572099685669, + "num_tokens": 8446116994.0, + "step": 16522 + }, + { + "epoch": 4.468090859924283, + "grad_norm": 3.78125, + "learning_rate": 0.0025294861546710013, + "loss": 2.9636, + "mean_token_accuracy": 0.4360628128051758, + "num_tokens": 8446641254.0, + "step": 16523 + }, + { + "epoch": 4.468361276365603, + "grad_norm": 4.125, + "learning_rate": 0.0025289535282733795, + "loss": 2.6462, + "mean_token_accuracy": 0.45271992683410645, + "num_tokens": 8447165383.0, + "step": 16524 + }, + { + "epoch": 4.468631692806922, + "grad_norm": 3.59375, + "learning_rate": 0.002528421161792374, + "loss": 2.8839, + "mean_token_accuracy": 0.43737709522247314, + "num_tokens": 8447689606.0, + "step": 16525 + }, + { + "epoch": 4.468902109248242, + "grad_norm": 3.3125, + "learning_rate": 0.0025278890552443197, + "loss": 2.7713, + "mean_token_accuracy": 0.4146166145801544, + "num_tokens": 8448208744.0, + "step": 16526 + }, + { + "epoch": 4.469172525689562, + "grad_norm": 2.953125, + "learning_rate": 0.0025273572086455458, + "loss": 2.9003, + "mean_token_accuracy": 0.4299677014350891, + "num_tokens": 8448706557.0, + "step": 16527 + }, + { + "epoch": 4.469442942130882, + "grad_norm": 3.65625, + "learning_rate": 0.0025268256220123686, + "loss": 2.6578, + "mean_token_accuracy": 0.44106513261795044, + "num_tokens": 8449230646.0, + "step": 16528 + }, + { + "epoch": 4.469713358572201, + "grad_norm": 3.15625, + "learning_rate": 0.0025262942953610994, + "loss": 2.7078, + "mean_token_accuracy": 0.44335874915122986, + "num_tokens": 8449754918.0, + "step": 16529 + }, + { + "epoch": 4.469983775013521, + "grad_norm": 3.421875, + "learning_rate": 0.002525763228708041, + "loss": 2.7541, + "mean_token_accuracy": 0.45084306597709656, + "num_tokens": 8450255319.0, + "step": 16530 + }, + { + "epoch": 4.47025419145484, + "grad_norm": 65.5, + "learning_rate": 0.002525232422069488, + "loss": 3.1891, + "mean_token_accuracy": 0.38568949699401855, + "num_tokens": 8450756234.0, + "step": 16531 + }, + { + "epoch": 4.47052460789616, + "grad_norm": 6.59375, + "learning_rate": 0.002524701875461725, + "loss": 2.8449, + "mean_token_accuracy": 0.4539908766746521, + "num_tokens": 8451250945.0, + "step": 16532 + }, + { + "epoch": 4.470795024337479, + "grad_norm": 3.28125, + "learning_rate": 0.002524171588901036, + "loss": 2.9005, + "mean_token_accuracy": 0.4155155122280121, + "num_tokens": 8451726818.0, + "step": 16533 + }, + { + "epoch": 4.4710654407787995, + "grad_norm": 3.8125, + "learning_rate": 0.0025236415624036866, + "loss": 2.837, + "mean_token_accuracy": 0.4350569248199463, + "num_tokens": 8452227467.0, + "step": 16534 + }, + { + "epoch": 4.471335857220119, + "grad_norm": 3.3125, + "learning_rate": 0.0025231117959859405, + "loss": 2.7034, + "mean_token_accuracy": 0.4459332227706909, + "num_tokens": 8452751573.0, + "step": 16535 + }, + { + "epoch": 4.471606273661439, + "grad_norm": 4.40625, + "learning_rate": 0.002522582289664055, + "loss": 2.7349, + "mean_token_accuracy": 0.4402064085006714, + "num_tokens": 8453275823.0, + "step": 16536 + }, + { + "epoch": 4.471876690102758, + "grad_norm": 3.59375, + "learning_rate": 0.002522053043454274, + "loss": 2.8266, + "mean_token_accuracy": 0.43835294246673584, + "num_tokens": 8453789568.0, + "step": 16537 + }, + { + "epoch": 4.472147106544078, + "grad_norm": 3.921875, + "learning_rate": 0.002521524057372838, + "loss": 2.7156, + "mean_token_accuracy": 0.44791534543037415, + "num_tokens": 8454313671.0, + "step": 16538 + }, + { + "epoch": 4.472417522985397, + "grad_norm": 5.0, + "learning_rate": 0.002520995331435978, + "loss": 2.838, + "mean_token_accuracy": 0.4545590877532959, + "num_tokens": 8454815732.0, + "step": 16539 + }, + { + "epoch": 4.472687939426717, + "grad_norm": 3.546875, + "learning_rate": 0.002520466865659914, + "loss": 2.7031, + "mean_token_accuracy": 0.446615993976593, + "num_tokens": 8455312916.0, + "step": 16540 + }, + { + "epoch": 4.472958355868037, + "grad_norm": 4.8125, + "learning_rate": 0.0025199386600608664, + "loss": 2.8063, + "mean_token_accuracy": 0.43082356452941895, + "num_tokens": 8455811669.0, + "step": 16541 + }, + { + "epoch": 4.473228772309357, + "grad_norm": 3.53125, + "learning_rate": 0.002519410714655039, + "loss": 2.7936, + "mean_token_accuracy": 0.4403107762336731, + "num_tokens": 8456335868.0, + "step": 16542 + }, + { + "epoch": 4.473499188750676, + "grad_norm": 4.21875, + "learning_rate": 0.002518883029458628, + "loss": 2.6654, + "mean_token_accuracy": 0.44518768787384033, + "num_tokens": 8456859975.0, + "step": 16543 + }, + { + "epoch": 4.473769605191996, + "grad_norm": 3.625, + "learning_rate": 0.00251835560448783, + "loss": 2.6879, + "mean_token_accuracy": 0.45142483711242676, + "num_tokens": 8457384131.0, + "step": 16544 + }, + { + "epoch": 4.474040021633315, + "grad_norm": 3.96875, + "learning_rate": 0.002517828439758823, + "loss": 2.7709, + "mean_token_accuracy": 0.4377082586288452, + "num_tokens": 8457908251.0, + "step": 16545 + }, + { + "epoch": 4.474310438074635, + "grad_norm": 3.28125, + "learning_rate": 0.0025173015352877837, + "loss": 2.6146, + "mean_token_accuracy": 0.44977036118507385, + "num_tokens": 8458411064.0, + "step": 16546 + }, + { + "epoch": 4.474580854515954, + "grad_norm": 3.046875, + "learning_rate": 0.0025167748910908806, + "loss": 2.8842, + "mean_token_accuracy": 0.44114962220191956, + "num_tokens": 8458935340.0, + "step": 16547 + }, + { + "epoch": 4.4748512709572745, + "grad_norm": 4.3125, + "learning_rate": 0.0025162485071842716, + "loss": 2.8495, + "mean_token_accuracy": 0.4307428300380707, + "num_tokens": 8459459476.0, + "step": 16548 + }, + { + "epoch": 4.475121687398594, + "grad_norm": 3.59375, + "learning_rate": 0.002515722383584105, + "loss": 2.7656, + "mean_token_accuracy": 0.44279196858406067, + "num_tokens": 8459952479.0, + "step": 16549 + }, + { + "epoch": 4.475392103839914, + "grad_norm": 3.546875, + "learning_rate": 0.0025151965203065296, + "loss": 2.8864, + "mean_token_accuracy": 0.46473634243011475, + "num_tokens": 8460450188.0, + "step": 16550 + }, + { + "epoch": 4.475662520281233, + "grad_norm": 29.875, + "learning_rate": 0.002514670917367675, + "loss": 2.3318, + "mean_token_accuracy": 0.46618378162384033, + "num_tokens": 8460876248.0, + "step": 16551 + }, + { + "epoch": 4.475932936722553, + "grad_norm": 7.59375, + "learning_rate": 0.0025141455747836705, + "loss": 2.789, + "mean_token_accuracy": 0.4191247224807739, + "num_tokens": 8461400489.0, + "step": 16552 + }, + { + "epoch": 4.476203353163872, + "grad_norm": 2.953125, + "learning_rate": 0.002513620492570635, + "loss": 2.7652, + "mean_token_accuracy": 0.4464494585990906, + "num_tokens": 8461870296.0, + "step": 16553 + }, + { + "epoch": 4.476473769605192, + "grad_norm": 3.71875, + "learning_rate": 0.00251309567074468, + "loss": 2.8401, + "mean_token_accuracy": 0.4360639154911041, + "num_tokens": 8462379600.0, + "step": 16554 + }, + { + "epoch": 4.476744186046512, + "grad_norm": 19.0, + "learning_rate": 0.002512571109321908, + "loss": 2.5782, + "mean_token_accuracy": 0.5092371702194214, + "num_tokens": 8462855121.0, + "step": 16555 + }, + { + "epoch": 4.477014602487832, + "grad_norm": 3.484375, + "learning_rate": 0.0025120468083184157, + "loss": 2.8415, + "mean_token_accuracy": 0.4272240996360779, + "num_tokens": 8463379399.0, + "step": 16556 + }, + { + "epoch": 4.477285018929151, + "grad_norm": 3.46875, + "learning_rate": 0.0025115227677502865, + "loss": 2.7032, + "mean_token_accuracy": 0.44322848320007324, + "num_tokens": 8463903571.0, + "step": 16557 + }, + { + "epoch": 4.477555435370471, + "grad_norm": 3.6875, + "learning_rate": 0.0025109989876336025, + "loss": 2.8015, + "mean_token_accuracy": 0.43761932849884033, + "num_tokens": 8464420101.0, + "step": 16558 + }, + { + "epoch": 4.47782585181179, + "grad_norm": 3.28125, + "learning_rate": 0.0025104754679844354, + "loss": 2.6439, + "mean_token_accuracy": 0.46385008096694946, + "num_tokens": 8464944379.0, + "step": 16559 + }, + { + "epoch": 4.478096268253109, + "grad_norm": 3.90625, + "learning_rate": 0.0025099522088188465, + "loss": 2.87, + "mean_token_accuracy": 0.4225115478038788, + "num_tokens": 8465468658.0, + "step": 16560 + }, + { + "epoch": 4.4783666846944294, + "grad_norm": 3.34375, + "learning_rate": 0.0025094292101528904, + "loss": 2.5725, + "mean_token_accuracy": 0.46466678380966187, + "num_tokens": 8465981226.0, + "step": 16561 + }, + { + "epoch": 4.478637101135749, + "grad_norm": 3.09375, + "learning_rate": 0.0025089064720026154, + "loss": 2.7831, + "mean_token_accuracy": 0.4432249069213867, + "num_tokens": 8466505499.0, + "step": 16562 + }, + { + "epoch": 4.478907517577069, + "grad_norm": 4.09375, + "learning_rate": 0.0025083839943840607, + "loss": 2.8757, + "mean_token_accuracy": 0.4354461431503296, + "num_tokens": 8467029735.0, + "step": 16563 + }, + { + "epoch": 4.479177934018388, + "grad_norm": 3.921875, + "learning_rate": 0.0025078617773132586, + "loss": 2.7987, + "mean_token_accuracy": 0.45307981967926025, + "num_tokens": 8467553915.0, + "step": 16564 + }, + { + "epoch": 4.479448350459708, + "grad_norm": 3.90625, + "learning_rate": 0.0025073398208062298, + "loss": 2.9252, + "mean_token_accuracy": 0.43941134214401245, + "num_tokens": 8468024389.0, + "step": 16565 + }, + { + "epoch": 4.479718766901027, + "grad_norm": 4.0625, + "learning_rate": 0.0025068181248789896, + "loss": 2.9262, + "mean_token_accuracy": 0.4357268214225769, + "num_tokens": 8468534325.0, + "step": 16566 + }, + { + "epoch": 4.479989183342347, + "grad_norm": 3.578125, + "learning_rate": 0.0025062966895475474, + "loss": 2.6189, + "mean_token_accuracy": 0.4508683681488037, + "num_tokens": 8469058449.0, + "step": 16567 + }, + { + "epoch": 4.4802595997836665, + "grad_norm": 3.203125, + "learning_rate": 0.0025057755148279004, + "loss": 2.6021, + "mean_token_accuracy": 0.45709139108657837, + "num_tokens": 8469582676.0, + "step": 16568 + }, + { + "epoch": 4.480530016224987, + "grad_norm": 3.78125, + "learning_rate": 0.002505254600736041, + "loss": 2.7534, + "mean_token_accuracy": 0.44449281692504883, + "num_tokens": 8470101578.0, + "step": 16569 + }, + { + "epoch": 4.480800432666306, + "grad_norm": 4.03125, + "learning_rate": 0.002504733947287952, + "loss": 2.8551, + "mean_token_accuracy": 0.4377523958683014, + "num_tokens": 8470625833.0, + "step": 16570 + }, + { + "epoch": 4.481070849107626, + "grad_norm": 65.0, + "learning_rate": 0.0025042135544996085, + "loss": 4.2361, + "mean_token_accuracy": 0.34870630502700806, + "num_tokens": 8471150093.0, + "step": 16571 + }, + { + "epoch": 4.481341265548945, + "grad_norm": 9.1875, + "learning_rate": 0.002503693422386976, + "loss": 2.6662, + "mean_token_accuracy": 0.4421020746231079, + "num_tokens": 8471674312.0, + "step": 16572 + }, + { + "epoch": 4.481611681990265, + "grad_norm": 3.109375, + "learning_rate": 0.0025031735509660185, + "loss": 2.7591, + "mean_token_accuracy": 0.4390721023082733, + "num_tokens": 8472198456.0, + "step": 16573 + }, + { + "epoch": 4.481882098431584, + "grad_norm": 4.125, + "learning_rate": 0.0025026539402526822, + "loss": 2.9003, + "mean_token_accuracy": 0.4577823579311371, + "num_tokens": 8472674084.0, + "step": 16574 + }, + { + "epoch": 4.4821525148729044, + "grad_norm": 3.0625, + "learning_rate": 0.002502134590262911, + "loss": 2.7745, + "mean_token_accuracy": 0.453192800283432, + "num_tokens": 8473198329.0, + "step": 16575 + }, + { + "epoch": 4.482422931314224, + "grad_norm": 4.28125, + "learning_rate": 0.0025016155010126433, + "loss": 2.5417, + "mean_token_accuracy": 0.5065575242042542, + "num_tokens": 8473657952.0, + "step": 16576 + }, + { + "epoch": 4.482693347755544, + "grad_norm": 3.6875, + "learning_rate": 0.0025010966725178036, + "loss": 2.6064, + "mean_token_accuracy": 0.45415323972702026, + "num_tokens": 8474182162.0, + "step": 16577 + }, + { + "epoch": 4.482963764196863, + "grad_norm": 3.71875, + "learning_rate": 0.0025005781047943098, + "loss": 2.8547, + "mean_token_accuracy": 0.44760221242904663, + "num_tokens": 8474706369.0, + "step": 16578 + }, + { + "epoch": 4.483234180638183, + "grad_norm": 3.640625, + "learning_rate": 0.0025000597978580776, + "loss": 2.6338, + "mean_token_accuracy": 0.44570016860961914, + "num_tokens": 8475230507.0, + "step": 16579 + }, + { + "epoch": 4.483504597079502, + "grad_norm": 2.9375, + "learning_rate": 0.0024995417517250057, + "loss": 2.6718, + "mean_token_accuracy": 0.46152031421661377, + "num_tokens": 8475754770.0, + "step": 16580 + }, + { + "epoch": 4.483775013520822, + "grad_norm": 2.671875, + "learning_rate": 0.0024990239664109916, + "loss": 2.8724, + "mean_token_accuracy": 0.4382488429546356, + "num_tokens": 8476278979.0, + "step": 16581 + }, + { + "epoch": 4.4840454299621415, + "grad_norm": 3.5, + "learning_rate": 0.0024985064419319205, + "loss": 2.816, + "mean_token_accuracy": 0.4294404089450836, + "num_tokens": 8476803074.0, + "step": 16582 + }, + { + "epoch": 4.484315846403462, + "grad_norm": 3.921875, + "learning_rate": 0.002497989178303675, + "loss": 2.6777, + "mean_token_accuracy": 0.4411315321922302, + "num_tokens": 8477327282.0, + "step": 16583 + }, + { + "epoch": 4.484586262844781, + "grad_norm": 3.71875, + "learning_rate": 0.002497472175542123, + "loss": 2.7632, + "mean_token_accuracy": 0.44223886728286743, + "num_tokens": 8477799970.0, + "step": 16584 + }, + { + "epoch": 4.484856679286101, + "grad_norm": 3.46875, + "learning_rate": 0.002496955433663131, + "loss": 2.8482, + "mean_token_accuracy": 0.4471997916698456, + "num_tokens": 8478324243.0, + "step": 16585 + }, + { + "epoch": 4.48512709572742, + "grad_norm": 3.609375, + "learning_rate": 0.0024964389526825493, + "loss": 2.714, + "mean_token_accuracy": 0.4431247115135193, + "num_tokens": 8478848421.0, + "step": 16586 + }, + { + "epoch": 4.48539751216874, + "grad_norm": 3.75, + "learning_rate": 0.0024959227326162296, + "loss": 2.8093, + "mean_token_accuracy": 0.44291621446609497, + "num_tokens": 8479372572.0, + "step": 16587 + }, + { + "epoch": 4.485667928610059, + "grad_norm": 4.03125, + "learning_rate": 0.00249540677348001, + "loss": 2.7985, + "mean_token_accuracy": 0.44960421323776245, + "num_tokens": 8479847353.0, + "step": 16588 + }, + { + "epoch": 4.4859383450513795, + "grad_norm": 4.1875, + "learning_rate": 0.0024948910752897193, + "loss": 2.864, + "mean_token_accuracy": 0.4169211983680725, + "num_tokens": 8480371629.0, + "step": 16589 + }, + { + "epoch": 4.486208761492699, + "grad_norm": 3.640625, + "learning_rate": 0.0024943756380611834, + "loss": 2.8998, + "mean_token_accuracy": 0.43710243701934814, + "num_tokens": 8480895722.0, + "step": 16590 + }, + { + "epoch": 4.486479177934019, + "grad_norm": 37.0, + "learning_rate": 0.0024938604618102173, + "loss": 2.3696, + "mean_token_accuracy": 0.4744327962398529, + "num_tokens": 8481382124.0, + "step": 16591 + }, + { + "epoch": 4.486749594375338, + "grad_norm": 8.3125, + "learning_rate": 0.002493345546552625, + "loss": 2.6854, + "mean_token_accuracy": 0.43906813859939575, + "num_tokens": 8481906265.0, + "step": 16592 + }, + { + "epoch": 4.487020010816658, + "grad_norm": 3.6875, + "learning_rate": 0.002492830892304209, + "loss": 2.8936, + "mean_token_accuracy": 0.44708457589149475, + "num_tokens": 8482412464.0, + "step": 16593 + }, + { + "epoch": 4.487290427257977, + "grad_norm": 4.6875, + "learning_rate": 0.0024923164990807594, + "loss": 2.6742, + "mean_token_accuracy": 0.44792139530181885, + "num_tokens": 8482896810.0, + "step": 16594 + }, + { + "epoch": 4.487560843699297, + "grad_norm": 3.453125, + "learning_rate": 0.002491802366898057, + "loss": 2.66, + "mean_token_accuracy": 0.45201802253723145, + "num_tokens": 8483420935.0, + "step": 16595 + }, + { + "epoch": 4.4878312601406165, + "grad_norm": 3.6875, + "learning_rate": 0.00249128849577188, + "loss": 2.6439, + "mean_token_accuracy": 0.46419209241867065, + "num_tokens": 8483902060.0, + "step": 16596 + }, + { + "epoch": 4.488101676581937, + "grad_norm": 4.0, + "learning_rate": 0.0024907748857179953, + "loss": 2.4329, + "mean_token_accuracy": 0.4631946384906769, + "num_tokens": 8484426095.0, + "step": 16597 + }, + { + "epoch": 4.488372093023256, + "grad_norm": 3.84375, + "learning_rate": 0.00249026153675216, + "loss": 2.722, + "mean_token_accuracy": 0.4434136152267456, + "num_tokens": 8484950278.0, + "step": 16598 + }, + { + "epoch": 4.488642509464576, + "grad_norm": 8.0625, + "learning_rate": 0.0024897484488901277, + "loss": 2.5406, + "mean_token_accuracy": 0.448768675327301, + "num_tokens": 8485474534.0, + "step": 16599 + }, + { + "epoch": 4.488912925905895, + "grad_norm": 3.015625, + "learning_rate": 0.002489235622147639, + "loss": 2.8658, + "mean_token_accuracy": 0.42603784799575806, + "num_tokens": 8485998638.0, + "step": 16600 + }, + { + "epoch": 4.489183342347214, + "grad_norm": 3.78125, + "learning_rate": 0.00248872305654043, + "loss": 2.7728, + "mean_token_accuracy": 0.45411980152130127, + "num_tokens": 8486522797.0, + "step": 16601 + }, + { + "epoch": 4.489453758788534, + "grad_norm": 3.609375, + "learning_rate": 0.002488210752084228, + "loss": 2.9246, + "mean_token_accuracy": 0.43452686071395874, + "num_tokens": 8487047040.0, + "step": 16602 + }, + { + "epoch": 4.489724175229854, + "grad_norm": 3.984375, + "learning_rate": 0.002487698708794752, + "loss": 2.7893, + "mean_token_accuracy": 0.4507364332675934, + "num_tokens": 8487571232.0, + "step": 16603 + }, + { + "epoch": 4.489994591671174, + "grad_norm": 3.609375, + "learning_rate": 0.0024871869266877115, + "loss": 2.9801, + "mean_token_accuracy": 0.4321896433830261, + "num_tokens": 8488095310.0, + "step": 16604 + }, + { + "epoch": 4.490265008112493, + "grad_norm": 4.0, + "learning_rate": 0.0024866754057788127, + "loss": 2.8551, + "mean_token_accuracy": 0.46893638372421265, + "num_tokens": 8488556398.0, + "step": 16605 + }, + { + "epoch": 4.490535424553813, + "grad_norm": 4.90625, + "learning_rate": 0.002486164146083747, + "loss": 2.9476, + "mean_token_accuracy": 0.43002015352249146, + "num_tokens": 8489042028.0, + "step": 16606 + }, + { + "epoch": 4.490805840995132, + "grad_norm": 4.15625, + "learning_rate": 0.0024856531476182024, + "loss": 2.6403, + "mean_token_accuracy": 0.4462894797325134, + "num_tokens": 8489512824.0, + "step": 16607 + }, + { + "epoch": 4.491076257436452, + "grad_norm": 4.1875, + "learning_rate": 0.0024851424103978594, + "loss": 3.0316, + "mean_token_accuracy": 0.42254549264907837, + "num_tokens": 8490037034.0, + "step": 16608 + }, + { + "epoch": 4.491346673877771, + "grad_norm": 5.46875, + "learning_rate": 0.002484631934438386, + "loss": 2.8548, + "mean_token_accuracy": 0.4301377236843109, + "num_tokens": 8490561222.0, + "step": 16609 + }, + { + "epoch": 4.4916170903190915, + "grad_norm": 4.625, + "learning_rate": 0.002484121719755448, + "loss": 2.7039, + "mean_token_accuracy": 0.4498557150363922, + "num_tokens": 8491085494.0, + "step": 16610 + }, + { + "epoch": 4.491887506760411, + "grad_norm": 150.0, + "learning_rate": 0.0024836117663647006, + "loss": 7.1764, + "mean_token_accuracy": 0.2217942774295807, + "num_tokens": 8491579251.0, + "step": 16611 + }, + { + "epoch": 4.492157923201731, + "grad_norm": 7.90625, + "learning_rate": 0.002483102074281787, + "loss": 2.602, + "mean_token_accuracy": 0.4584488272666931, + "num_tokens": 8492103518.0, + "step": 16612 + }, + { + "epoch": 4.49242833964305, + "grad_norm": 2.703125, + "learning_rate": 0.002482592643522349, + "loss": 2.798, + "mean_token_accuracy": 0.44467902183532715, + "num_tokens": 8492606742.0, + "step": 16613 + }, + { + "epoch": 4.49269875608437, + "grad_norm": 3.78125, + "learning_rate": 0.002482083474102018, + "loss": 2.887, + "mean_token_accuracy": 0.42710429430007935, + "num_tokens": 8493130890.0, + "step": 16614 + }, + { + "epoch": 4.492969172525689, + "grad_norm": 3.546875, + "learning_rate": 0.0024815745660364146, + "loss": 2.8857, + "mean_token_accuracy": 0.42092207074165344, + "num_tokens": 8493655060.0, + "step": 16615 + }, + { + "epoch": 4.493239588967009, + "grad_norm": 4.625, + "learning_rate": 0.0024810659193411555, + "loss": 3.018, + "mean_token_accuracy": 0.4176672399044037, + "num_tokens": 8494177162.0, + "step": 16616 + }, + { + "epoch": 4.493510005408329, + "grad_norm": 4.3125, + "learning_rate": 0.002480557534031846, + "loss": 2.7453, + "mean_token_accuracy": 0.4432486295700073, + "num_tokens": 8494701405.0, + "step": 16617 + }, + { + "epoch": 4.493780421849649, + "grad_norm": 3.890625, + "learning_rate": 0.0024800494101240847, + "loss": 2.5329, + "mean_token_accuracy": 0.48670923709869385, + "num_tokens": 8495120197.0, + "step": 16618 + }, + { + "epoch": 4.494050838290968, + "grad_norm": 3.84375, + "learning_rate": 0.002479541547633464, + "loss": 2.5812, + "mean_token_accuracy": 0.465338796377182, + "num_tokens": 8495644392.0, + "step": 16619 + }, + { + "epoch": 4.494321254732288, + "grad_norm": 4.4375, + "learning_rate": 0.002479033946575565, + "loss": 2.9388, + "mean_token_accuracy": 0.42227357625961304, + "num_tokens": 8496168587.0, + "step": 16620 + }, + { + "epoch": 4.494591671173607, + "grad_norm": 4.65625, + "learning_rate": 0.0024785266069659627, + "loss": 2.7139, + "mean_token_accuracy": 0.46325212717056274, + "num_tokens": 8496692774.0, + "step": 16621 + }, + { + "epoch": 4.494862087614927, + "grad_norm": 4.59375, + "learning_rate": 0.002478019528820225, + "loss": 2.5086, + "mean_token_accuracy": 0.4869842529296875, + "num_tokens": 8497217030.0, + "step": 16622 + }, + { + "epoch": 4.495132504056246, + "grad_norm": 3.65625, + "learning_rate": 0.0024775127121539086, + "loss": 2.6943, + "mean_token_accuracy": 0.4445174038410187, + "num_tokens": 8497741178.0, + "step": 16623 + }, + { + "epoch": 4.4954029204975665, + "grad_norm": 3.34375, + "learning_rate": 0.002477006156982566, + "loss": 2.7671, + "mean_token_accuracy": 0.43500816822052, + "num_tokens": 8498265393.0, + "step": 16624 + }, + { + "epoch": 4.495673336938886, + "grad_norm": 4.5625, + "learning_rate": 0.002476499863321739, + "loss": 2.718, + "mean_token_accuracy": 0.4379740357398987, + "num_tokens": 8498789629.0, + "step": 16625 + }, + { + "epoch": 4.495943753380206, + "grad_norm": 3.59375, + "learning_rate": 0.0024759938311869627, + "loss": 2.8082, + "mean_token_accuracy": 0.43442046642303467, + "num_tokens": 8499313841.0, + "step": 16626 + }, + { + "epoch": 4.496214169821525, + "grad_norm": 3.71875, + "learning_rate": 0.002475488060593761, + "loss": 2.8221, + "mean_token_accuracy": 0.43730032444000244, + "num_tokens": 8499834494.0, + "step": 16627 + }, + { + "epoch": 4.496484586262845, + "grad_norm": 4.0, + "learning_rate": 0.0024749825515576565, + "loss": 2.7695, + "mean_token_accuracy": 0.45248979330062866, + "num_tokens": 8500358734.0, + "step": 16628 + }, + { + "epoch": 4.496755002704164, + "grad_norm": 3.921875, + "learning_rate": 0.0024744773040941584, + "loss": 2.8841, + "mean_token_accuracy": 0.443245530128479, + "num_tokens": 8500824697.0, + "step": 16629 + }, + { + "epoch": 4.497025419145484, + "grad_norm": 3.6875, + "learning_rate": 0.002473972318218767, + "loss": 2.6766, + "mean_token_accuracy": 0.4347914755344391, + "num_tokens": 8501297330.0, + "step": 16630 + }, + { + "epoch": 4.497295835586804, + "grad_norm": 28.75, + "learning_rate": 0.0024734675939469784, + "loss": 3.4948, + "mean_token_accuracy": 0.37090691924095154, + "num_tokens": 8501821518.0, + "step": 16631 + }, + { + "epoch": 4.497566252028124, + "grad_norm": 7.46875, + "learning_rate": 0.002472963131294279, + "loss": 2.7431, + "mean_token_accuracy": 0.4429481625556946, + "num_tokens": 8502345782.0, + "step": 16632 + }, + { + "epoch": 4.497836668469443, + "grad_norm": 2.75, + "learning_rate": 0.0024724589302761476, + "loss": 2.8423, + "mean_token_accuracy": 0.427567720413208, + "num_tokens": 8502869955.0, + "step": 16633 + }, + { + "epoch": 4.498107084910763, + "grad_norm": 3.359375, + "learning_rate": 0.0024719549909080546, + "loss": 2.744, + "mean_token_accuracy": 0.4408799409866333, + "num_tokens": 8503394156.0, + "step": 16634 + }, + { + "epoch": 4.498377501352082, + "grad_norm": 2.75, + "learning_rate": 0.0024714513132054606, + "loss": 2.8081, + "mean_token_accuracy": 0.44406527280807495, + "num_tokens": 8503918433.0, + "step": 16635 + }, + { + "epoch": 4.498647917793402, + "grad_norm": 3.515625, + "learning_rate": 0.002470947897183822, + "loss": 2.7981, + "mean_token_accuracy": 0.4336734116077423, + "num_tokens": 8504396299.0, + "step": 16636 + }, + { + "epoch": 4.498918334234721, + "grad_norm": 3.65625, + "learning_rate": 0.0024704447428585846, + "loss": 2.8102, + "mean_token_accuracy": 0.445944219827652, + "num_tokens": 8504920583.0, + "step": 16637 + }, + { + "epoch": 4.4991887506760415, + "grad_norm": 4.0625, + "learning_rate": 0.0024699418502451844, + "loss": 2.6775, + "mean_token_accuracy": 0.44326621294021606, + "num_tokens": 8505444830.0, + "step": 16638 + }, + { + "epoch": 4.499459167117361, + "grad_norm": 3.625, + "learning_rate": 0.0024694392193590543, + "loss": 2.8169, + "mean_token_accuracy": 0.4384918808937073, + "num_tokens": 8505968951.0, + "step": 16639 + }, + { + "epoch": 4.499729583558681, + "grad_norm": 4.0625, + "learning_rate": 0.0024689368502156164, + "loss": 2.8352, + "mean_token_accuracy": 0.4370215833187103, + "num_tokens": 8506493145.0, + "step": 16640 + }, + { + "epoch": 4.5, + "grad_norm": 3.328125, + "learning_rate": 0.0024684347428302828, + "loss": 2.7549, + "mean_token_accuracy": 0.42066043615341187, + "num_tokens": 8507017407.0, + "step": 16641 + }, + { + "epoch": 4.500270416441319, + "grad_norm": 3.40625, + "learning_rate": 0.0024679328972184622, + "loss": 2.7111, + "mean_token_accuracy": 0.4421449303627014, + "num_tokens": 8507541499.0, + "step": 16642 + }, + { + "epoch": 4.500540832882639, + "grad_norm": 3.328125, + "learning_rate": 0.002467431313395552, + "loss": 2.6632, + "mean_token_accuracy": 0.45381081104278564, + "num_tokens": 8508023453.0, + "step": 16643 + }, + { + "epoch": 4.500811249323959, + "grad_norm": 3.671875, + "learning_rate": 0.0024669299913769403, + "loss": 2.9687, + "mean_token_accuracy": 0.4274132251739502, + "num_tokens": 8508547691.0, + "step": 16644 + }, + { + "epoch": 4.501081665765279, + "grad_norm": 3.515625, + "learning_rate": 0.0024664289311780115, + "loss": 2.7812, + "mean_token_accuracy": 0.46158304810523987, + "num_tokens": 8509048913.0, + "step": 16645 + }, + { + "epoch": 4.501352082206598, + "grad_norm": 3.640625, + "learning_rate": 0.002465928132814139, + "loss": 2.8089, + "mean_token_accuracy": 0.4232032597064972, + "num_tokens": 8509572954.0, + "step": 16646 + }, + { + "epoch": 4.501622498647918, + "grad_norm": 3.78125, + "learning_rate": 0.0024654275963006873, + "loss": 2.9239, + "mean_token_accuracy": 0.4240994155406952, + "num_tokens": 8510097130.0, + "step": 16647 + }, + { + "epoch": 4.501892915089237, + "grad_norm": 4.3125, + "learning_rate": 0.0024649273216530163, + "loss": 2.852, + "mean_token_accuracy": 0.42124438285827637, + "num_tokens": 8510621187.0, + "step": 16648 + }, + { + "epoch": 4.502163331530557, + "grad_norm": 90.0, + "learning_rate": 0.0024644273088864762, + "loss": 3.0804, + "mean_token_accuracy": 0.4173068404197693, + "num_tokens": 8511145280.0, + "step": 16649 + }, + { + "epoch": 4.502433747971876, + "grad_norm": 7.0, + "learning_rate": 0.0024639275580164055, + "loss": 2.7989, + "mean_token_accuracy": 0.44186297059059143, + "num_tokens": 8511669293.0, + "step": 16650 + }, + { + "epoch": 4.502704164413196, + "grad_norm": 390.0, + "learning_rate": 0.0024634280690581413, + "loss": 6.4665, + "mean_token_accuracy": 0.1676875650882721, + "num_tokens": 8512193492.0, + "step": 16651 + }, + { + "epoch": 4.502974580854516, + "grad_norm": 6.96875, + "learning_rate": 0.002462928842027008, + "loss": 2.9055, + "mean_token_accuracy": 0.42330795526504517, + "num_tokens": 8512680520.0, + "step": 16652 + }, + { + "epoch": 4.503244997295836, + "grad_norm": 2.734375, + "learning_rate": 0.002462429876938323, + "loss": 2.7885, + "mean_token_accuracy": 0.4524303674697876, + "num_tokens": 8513204701.0, + "step": 16653 + }, + { + "epoch": 4.503515413737155, + "grad_norm": 3.96875, + "learning_rate": 0.002461931173807398, + "loss": 2.7518, + "mean_token_accuracy": 0.46978336572647095, + "num_tokens": 8513728928.0, + "step": 16654 + }, + { + "epoch": 4.503785830178475, + "grad_norm": 4.375, + "learning_rate": 0.002461432732649533, + "loss": 2.8352, + "mean_token_accuracy": 0.418087363243103, + "num_tokens": 8514253054.0, + "step": 16655 + }, + { + "epoch": 4.504056246619794, + "grad_norm": 3.15625, + "learning_rate": 0.0024609345534800224, + "loss": 2.8256, + "mean_token_accuracy": 0.444680780172348, + "num_tokens": 8514777195.0, + "step": 16656 + }, + { + "epoch": 4.504326663061114, + "grad_norm": 3.71875, + "learning_rate": 0.0024604366363141523, + "loss": 3.0833, + "mean_token_accuracy": 0.428036093711853, + "num_tokens": 8515255876.0, + "step": 16657 + }, + { + "epoch": 4.5045970795024335, + "grad_norm": 3.421875, + "learning_rate": 0.0024599389811671975, + "loss": 2.7928, + "mean_token_accuracy": 0.44368094205856323, + "num_tokens": 8515736399.0, + "step": 16658 + }, + { + "epoch": 4.504867495943754, + "grad_norm": 3.203125, + "learning_rate": 0.0024594415880544304, + "loss": 2.6847, + "mean_token_accuracy": 0.4434550404548645, + "num_tokens": 8516260647.0, + "step": 16659 + }, + { + "epoch": 4.505137912385073, + "grad_norm": 3.703125, + "learning_rate": 0.0024589444569911123, + "loss": 2.8989, + "mean_token_accuracy": 0.4301132559776306, + "num_tokens": 8516784850.0, + "step": 16660 + }, + { + "epoch": 4.505408328826393, + "grad_norm": 10.625, + "learning_rate": 0.002458447587992494, + "loss": 2.6766, + "mean_token_accuracy": 0.46419480443000793, + "num_tokens": 8517309091.0, + "step": 16661 + }, + { + "epoch": 4.505678745267712, + "grad_norm": 3.578125, + "learning_rate": 0.0024579509810738237, + "loss": 2.6032, + "mean_token_accuracy": 0.4453861117362976, + "num_tokens": 8517833238.0, + "step": 16662 + }, + { + "epoch": 4.505949161709032, + "grad_norm": 3.40625, + "learning_rate": 0.0024574546362503393, + "loss": 2.7216, + "mean_token_accuracy": 0.44359642267227173, + "num_tokens": 8518357486.0, + "step": 16663 + }, + { + "epoch": 4.506219578150351, + "grad_norm": 3.921875, + "learning_rate": 0.0024569585535372663, + "loss": 2.7491, + "mean_token_accuracy": 0.4570571184158325, + "num_tokens": 8518881611.0, + "step": 16664 + }, + { + "epoch": 4.506489994591671, + "grad_norm": 3.578125, + "learning_rate": 0.0024564627329498293, + "loss": 2.7382, + "mean_token_accuracy": 0.43573999404907227, + "num_tokens": 8519405872.0, + "step": 16665 + }, + { + "epoch": 4.506760411032991, + "grad_norm": 3.46875, + "learning_rate": 0.0024559671745032403, + "loss": 2.6776, + "mean_token_accuracy": 0.4656660556793213, + "num_tokens": 8519917310.0, + "step": 16666 + }, + { + "epoch": 4.507030827474311, + "grad_norm": 3.25, + "learning_rate": 0.002455471878212706, + "loss": 2.7478, + "mean_token_accuracy": 0.44138818979263306, + "num_tokens": 8520441556.0, + "step": 16667 + }, + { + "epoch": 4.50730124391563, + "grad_norm": 3.46875, + "learning_rate": 0.00245497684409342, + "loss": 2.7324, + "mean_token_accuracy": 0.44845014810562134, + "num_tokens": 8520965671.0, + "step": 16668 + }, + { + "epoch": 4.50757166035695, + "grad_norm": 4.625, + "learning_rate": 0.0024544820721605753, + "loss": 2.7343, + "mean_token_accuracy": 0.46288400888442993, + "num_tokens": 8521441204.0, + "step": 16669 + }, + { + "epoch": 4.507842076798269, + "grad_norm": 4.3125, + "learning_rate": 0.0024539875624293504, + "loss": 2.6216, + "mean_token_accuracy": 0.4574260115623474, + "num_tokens": 8521965387.0, + "step": 16670 + }, + { + "epoch": 4.508112493239589, + "grad_norm": 28.375, + "learning_rate": 0.002453493314914921, + "loss": 3.3744, + "mean_token_accuracy": 0.39222806692123413, + "num_tokens": 8522489626.0, + "step": 16671 + }, + { + "epoch": 4.5083829096809085, + "grad_norm": 6.8125, + "learning_rate": 0.00245299932963245, + "loss": 2.9803, + "mean_token_accuracy": 0.410402774810791, + "num_tokens": 8523013902.0, + "step": 16672 + }, + { + "epoch": 4.508653326122229, + "grad_norm": 3.203125, + "learning_rate": 0.0024525056065970922, + "loss": 2.9, + "mean_token_accuracy": 0.41714078187942505, + "num_tokens": 8523538145.0, + "step": 16673 + }, + { + "epoch": 4.508923742563548, + "grad_norm": 3.359375, + "learning_rate": 0.0024520121458240013, + "loss": 2.6045, + "mean_token_accuracy": 0.4460197985172272, + "num_tokens": 8524062407.0, + "step": 16674 + }, + { + "epoch": 4.509194159004868, + "grad_norm": 3.8125, + "learning_rate": 0.002451518947328316, + "loss": 2.8997, + "mean_token_accuracy": 0.42634350061416626, + "num_tokens": 8524586650.0, + "step": 16675 + }, + { + "epoch": 4.509464575446187, + "grad_norm": 4.78125, + "learning_rate": 0.002451026011125167, + "loss": 2.7433, + "mean_token_accuracy": 0.44687533378601074, + "num_tokens": 8525110929.0, + "step": 16676 + }, + { + "epoch": 4.509734991887507, + "grad_norm": 3.828125, + "learning_rate": 0.0024505333372296825, + "loss": 2.7376, + "mean_token_accuracy": 0.44275081157684326, + "num_tokens": 8525635200.0, + "step": 16677 + }, + { + "epoch": 4.510005408328826, + "grad_norm": 3.625, + "learning_rate": 0.0024500409256569757, + "loss": 2.9499, + "mean_token_accuracy": 0.4673798680305481, + "num_tokens": 8525991944.0, + "step": 16678 + }, + { + "epoch": 4.5102758247701455, + "grad_norm": 3.625, + "learning_rate": 0.0024495487764221585, + "loss": 2.5212, + "mean_token_accuracy": 0.48869821429252625, + "num_tokens": 8526453277.0, + "step": 16679 + }, + { + "epoch": 4.510546241211466, + "grad_norm": 2.765625, + "learning_rate": 0.0024490568895403297, + "loss": 2.8087, + "mean_token_accuracy": 0.43774065375328064, + "num_tokens": 8526977448.0, + "step": 16680 + }, + { + "epoch": 4.510816657652786, + "grad_norm": 3.3125, + "learning_rate": 0.002448565265026583, + "loss": 2.7445, + "mean_token_accuracy": 0.43647295236587524, + "num_tokens": 8527501654.0, + "step": 16681 + }, + { + "epoch": 4.511087074094105, + "grad_norm": 4.03125, + "learning_rate": 0.0024480739028960004, + "loss": 2.8567, + "mean_token_accuracy": 0.4442365765571594, + "num_tokens": 8527982595.0, + "step": 16682 + }, + { + "epoch": 4.511357490535424, + "grad_norm": 3.78125, + "learning_rate": 0.0024475828031636617, + "loss": 2.7472, + "mean_token_accuracy": 0.44087526202201843, + "num_tokens": 8528494091.0, + "step": 16683 + }, + { + "epoch": 4.511627906976744, + "grad_norm": 3.703125, + "learning_rate": 0.0024470919658446323, + "loss": 2.8056, + "mean_token_accuracy": 0.41983482241630554, + "num_tokens": 8529018261.0, + "step": 16684 + }, + { + "epoch": 4.511898323418064, + "grad_norm": 3.640625, + "learning_rate": 0.0024466013909539754, + "loss": 2.8356, + "mean_token_accuracy": 0.4441608786582947, + "num_tokens": 8529542431.0, + "step": 16685 + }, + { + "epoch": 4.5121687398593835, + "grad_norm": 4.03125, + "learning_rate": 0.002446111078506741, + "loss": 2.6919, + "mean_token_accuracy": 0.43956705927848816, + "num_tokens": 8530066713.0, + "step": 16686 + }, + { + "epoch": 4.512439156300703, + "grad_norm": 3.390625, + "learning_rate": 0.002445621028517974, + "loss": 2.6799, + "mean_token_accuracy": 0.46700620651245117, + "num_tokens": 8530527339.0, + "step": 16687 + }, + { + "epoch": 4.512709572742023, + "grad_norm": 3.75, + "learning_rate": 0.00244513124100271, + "loss": 2.6371, + "mean_token_accuracy": 0.4589749872684479, + "num_tokens": 8531051619.0, + "step": 16688 + }, + { + "epoch": 4.512979989183342, + "grad_norm": 3.90625, + "learning_rate": 0.00244464171597598, + "loss": 2.6716, + "mean_token_accuracy": 0.46409302949905396, + "num_tokens": 8531537231.0, + "step": 16689 + }, + { + "epoch": 4.513250405624662, + "grad_norm": 4.125, + "learning_rate": 0.0024441524534527994, + "loss": 2.6912, + "mean_token_accuracy": 0.4552862048149109, + "num_tokens": 8532061354.0, + "step": 16690 + }, + { + "epoch": 4.513520822065981, + "grad_norm": 66.5, + "learning_rate": 0.0024436634534481835, + "loss": 4.1483, + "mean_token_accuracy": 0.32101553678512573, + "num_tokens": 8532528277.0, + "step": 16691 + }, + { + "epoch": 4.513791238507301, + "grad_norm": 7.34375, + "learning_rate": 0.0024431747159771354, + "loss": 2.8982, + "mean_token_accuracy": 0.44305694103240967, + "num_tokens": 8533011157.0, + "step": 16692 + }, + { + "epoch": 4.5140616549486205, + "grad_norm": 3.75, + "learning_rate": 0.002442686241054651, + "loss": 2.8574, + "mean_token_accuracy": 0.4423344135284424, + "num_tokens": 8533535414.0, + "step": 16693 + }, + { + "epoch": 4.514332071389941, + "grad_norm": 4.5625, + "learning_rate": 0.0024421980286957177, + "loss": 2.8954, + "mean_token_accuracy": 0.45238932967185974, + "num_tokens": 8534019680.0, + "step": 16694 + }, + { + "epoch": 4.51460248783126, + "grad_norm": 3.875, + "learning_rate": 0.0024417100789153154, + "loss": 2.8128, + "mean_token_accuracy": 0.4387561082839966, + "num_tokens": 8534540376.0, + "step": 16695 + }, + { + "epoch": 4.51487290427258, + "grad_norm": 3.765625, + "learning_rate": 0.0024412223917284165, + "loss": 2.8697, + "mean_token_accuracy": 0.4431798458099365, + "num_tokens": 8535064655.0, + "step": 16696 + }, + { + "epoch": 4.515143320713899, + "grad_norm": 4.375, + "learning_rate": 0.002440734967149985, + "loss": 2.5401, + "mean_token_accuracy": 0.45658302307128906, + "num_tokens": 8535588760.0, + "step": 16697 + }, + { + "epoch": 4.515413737155219, + "grad_norm": 3.21875, + "learning_rate": 0.002440247805194976, + "loss": 2.6351, + "mean_token_accuracy": 0.4483747184276581, + "num_tokens": 8536112956.0, + "step": 16698 + }, + { + "epoch": 4.515684153596538, + "grad_norm": 3.484375, + "learning_rate": 0.002439760905878336, + "loss": 2.8277, + "mean_token_accuracy": 0.45980992913246155, + "num_tokens": 8536588056.0, + "step": 16699 + }, + { + "epoch": 4.5159545700378585, + "grad_norm": 3.90625, + "learning_rate": 0.0024392742692150066, + "loss": 2.7366, + "mean_token_accuracy": 0.4525528848171234, + "num_tokens": 8537112293.0, + "step": 16700 + }, + { + "epoch": 4.516224986479178, + "grad_norm": 3.171875, + "learning_rate": 0.002438787895219916, + "loss": 2.8789, + "mean_token_accuracy": 0.4317462742328644, + "num_tokens": 8537636505.0, + "step": 16701 + }, + { + "epoch": 4.516495402920498, + "grad_norm": 3.015625, + "learning_rate": 0.0024383017839079923, + "loss": 2.7353, + "mean_token_accuracy": 0.4487777650356293, + "num_tokens": 8538123165.0, + "step": 16702 + }, + { + "epoch": 4.516765819361817, + "grad_norm": 3.25, + "learning_rate": 0.0024378159352941467, + "loss": 2.7415, + "mean_token_accuracy": 0.4381159543991089, + "num_tokens": 8538647301.0, + "step": 16703 + }, + { + "epoch": 4.517036235803137, + "grad_norm": 8.375, + "learning_rate": 0.002437330349393288, + "loss": 2.6027, + "mean_token_accuracy": 0.47033560276031494, + "num_tokens": 8539171508.0, + "step": 16704 + }, + { + "epoch": 4.517306652244456, + "grad_norm": 3.265625, + "learning_rate": 0.0024368450262203163, + "loss": 2.7063, + "mean_token_accuracy": 0.4388946294784546, + "num_tokens": 8539695761.0, + "step": 16705 + }, + { + "epoch": 4.517577068685776, + "grad_norm": 4.0625, + "learning_rate": 0.002436359965790121, + "loss": 2.8745, + "mean_token_accuracy": 0.4711862802505493, + "num_tokens": 8540155525.0, + "step": 16706 + }, + { + "epoch": 4.5178474851270956, + "grad_norm": 3.75, + "learning_rate": 0.002435875168117586, + "loss": 2.61, + "mean_token_accuracy": 0.44264331459999084, + "num_tokens": 8540679649.0, + "step": 16707 + }, + { + "epoch": 4.518117901568416, + "grad_norm": 3.34375, + "learning_rate": 0.0024353906332175864, + "loss": 2.8679, + "mean_token_accuracy": 0.42576101422309875, + "num_tokens": 8541203923.0, + "step": 16708 + }, + { + "epoch": 4.518388318009735, + "grad_norm": 3.4375, + "learning_rate": 0.002434906361104988, + "loss": 2.6825, + "mean_token_accuracy": 0.4227216839790344, + "num_tokens": 8541722728.0, + "step": 16709 + }, + { + "epoch": 4.518658734451055, + "grad_norm": 3.953125, + "learning_rate": 0.0024344223517946515, + "loss": 2.8199, + "mean_token_accuracy": 0.441101610660553, + "num_tokens": 8542219851.0, + "step": 16710 + }, + { + "epoch": 4.518929150892374, + "grad_norm": 18.625, + "learning_rate": 0.0024339386053014267, + "loss": 2.4061, + "mean_token_accuracy": 0.5183169841766357, + "num_tokens": 8542717955.0, + "step": 16711 + }, + { + "epoch": 4.519199567333694, + "grad_norm": 6.5, + "learning_rate": 0.0024334551216401577, + "loss": 2.8127, + "mean_token_accuracy": 0.44536343216896057, + "num_tokens": 8543242154.0, + "step": 16712 + }, + { + "epoch": 4.519469983775013, + "grad_norm": 3.40625, + "learning_rate": 0.0024329719008256766, + "loss": 2.6075, + "mean_token_accuracy": 0.46121370792388916, + "num_tokens": 8543766257.0, + "step": 16713 + }, + { + "epoch": 4.5197404002163335, + "grad_norm": 3.8125, + "learning_rate": 0.002432488942872814, + "loss": 2.9006, + "mean_token_accuracy": 0.4128083288669586, + "num_tokens": 8544290501.0, + "step": 16714 + }, + { + "epoch": 4.520010816657653, + "grad_norm": 3.703125, + "learning_rate": 0.0024320062477963846, + "loss": 2.6939, + "mean_token_accuracy": 0.4711224436759949, + "num_tokens": 8544814586.0, + "step": 16715 + }, + { + "epoch": 4.520281233098973, + "grad_norm": 3.96875, + "learning_rate": 0.0024315238156111993, + "loss": 2.6921, + "mean_token_accuracy": 0.4374806880950928, + "num_tokens": 8545338806.0, + "step": 16716 + }, + { + "epoch": 4.520551649540292, + "grad_norm": 3.546875, + "learning_rate": 0.0024310416463320626, + "loss": 2.7875, + "mean_token_accuracy": 0.4359892010688782, + "num_tokens": 8545863079.0, + "step": 16717 + }, + { + "epoch": 4.520822065981612, + "grad_norm": 4.21875, + "learning_rate": 0.0024305597399737686, + "loss": 2.6758, + "mean_token_accuracy": 0.45827633142471313, + "num_tokens": 8546387340.0, + "step": 16718 + }, + { + "epoch": 4.521092482422931, + "grad_norm": 5.0, + "learning_rate": 0.0024300780965511018, + "loss": 2.848, + "mean_token_accuracy": 0.4242643117904663, + "num_tokens": 8546911545.0, + "step": 16719 + }, + { + "epoch": 4.5213628988642505, + "grad_norm": 16.375, + "learning_rate": 0.0024295967160788416, + "loss": 2.6608, + "mean_token_accuracy": 0.462399423122406, + "num_tokens": 8547435720.0, + "step": 16720 + }, + { + "epoch": 4.5216333153055706, + "grad_norm": 3.53125, + "learning_rate": 0.00242911559857176, + "loss": 2.9377, + "mean_token_accuracy": 0.4229221045970917, + "num_tokens": 8547959953.0, + "step": 16721 + }, + { + "epoch": 4.521903731746891, + "grad_norm": 5.21875, + "learning_rate": 0.0024286347440446135, + "loss": 2.9333, + "mean_token_accuracy": 0.4379720091819763, + "num_tokens": 8548484102.0, + "step": 16722 + }, + { + "epoch": 4.52217414818821, + "grad_norm": 3.90625, + "learning_rate": 0.0024281541525121614, + "loss": 2.8172, + "mean_token_accuracy": 0.4269794821739197, + "num_tokens": 8549008244.0, + "step": 16723 + }, + { + "epoch": 4.522444564629529, + "grad_norm": 3.203125, + "learning_rate": 0.0024276738239891495, + "loss": 2.7948, + "mean_token_accuracy": 0.4510383605957031, + "num_tokens": 8549520356.0, + "step": 16724 + }, + { + "epoch": 4.522714981070849, + "grad_norm": 3.359375, + "learning_rate": 0.002427193758490312, + "loss": 2.5832, + "mean_token_accuracy": 0.45371830463409424, + "num_tokens": 8550044503.0, + "step": 16725 + }, + { + "epoch": 4.522985397512169, + "grad_norm": 3.359375, + "learning_rate": 0.002426713956030382, + "loss": 2.832, + "mean_token_accuracy": 0.4400443434715271, + "num_tokens": 8550568739.0, + "step": 16726 + }, + { + "epoch": 4.523255813953488, + "grad_norm": 4.125, + "learning_rate": 0.00242623441662408, + "loss": 2.7919, + "mean_token_accuracy": 0.44169071316719055, + "num_tokens": 8551092911.0, + "step": 16727 + }, + { + "epoch": 4.523526230394808, + "grad_norm": 3.453125, + "learning_rate": 0.0024257551402861203, + "loss": 2.8298, + "mean_token_accuracy": 0.43726062774658203, + "num_tokens": 8551617038.0, + "step": 16728 + }, + { + "epoch": 4.523796646836128, + "grad_norm": 3.65625, + "learning_rate": 0.0024252761270312085, + "loss": 2.7693, + "mean_token_accuracy": 0.4760468602180481, + "num_tokens": 8552079604.0, + "step": 16729 + }, + { + "epoch": 4.524067063277447, + "grad_norm": 3.296875, + "learning_rate": 0.002424797376874041, + "loss": 2.7652, + "mean_token_accuracy": 0.45634010434150696, + "num_tokens": 8552594997.0, + "step": 16730 + }, + { + "epoch": 4.524337479718767, + "grad_norm": 78.5, + "learning_rate": 0.0024243188898293087, + "loss": 5.3317, + "mean_token_accuracy": 0.2812918424606323, + "num_tokens": 8553119237.0, + "step": 16731 + }, + { + "epoch": 4.524607896160086, + "grad_norm": 6.21875, + "learning_rate": 0.002423840665911693, + "loss": 2.7869, + "mean_token_accuracy": 0.45044246315956116, + "num_tokens": 8553626104.0, + "step": 16732 + }, + { + "epoch": 4.524878312601406, + "grad_norm": 10.8125, + "learning_rate": 0.0024233627051358643, + "loss": 2.7013, + "mean_token_accuracy": 0.4519217610359192, + "num_tokens": 8554150242.0, + "step": 16733 + }, + { + "epoch": 4.5251487290427255, + "grad_norm": 3.953125, + "learning_rate": 0.0024228850075164914, + "loss": 2.8871, + "mean_token_accuracy": 0.43665528297424316, + "num_tokens": 8554662356.0, + "step": 16734 + }, + { + "epoch": 4.525419145484046, + "grad_norm": 3.515625, + "learning_rate": 0.00242240757306823, + "loss": 2.7587, + "mean_token_accuracy": 0.4555083215236664, + "num_tokens": 8555186624.0, + "step": 16735 + }, + { + "epoch": 4.525689561925365, + "grad_norm": 2.796875, + "learning_rate": 0.0024219304018057294, + "loss": 2.5719, + "mean_token_accuracy": 0.43949827551841736, + "num_tokens": 8555710891.0, + "step": 16736 + }, + { + "epoch": 4.525959978366685, + "grad_norm": 3.1875, + "learning_rate": 0.00242145349374363, + "loss": 2.6176, + "mean_token_accuracy": 0.4404256343841553, + "num_tokens": 8556235013.0, + "step": 16737 + }, + { + "epoch": 4.526230394808004, + "grad_norm": 3.40625, + "learning_rate": 0.002420976848896565, + "loss": 2.6364, + "mean_token_accuracy": 0.44484230875968933, + "num_tokens": 8556759085.0, + "step": 16738 + }, + { + "epoch": 4.526500811249324, + "grad_norm": 7.5, + "learning_rate": 0.0024205004672791593, + "loss": 2.6763, + "mean_token_accuracy": 0.4884543716907501, + "num_tokens": 8557211583.0, + "step": 16739 + }, + { + "epoch": 4.526771227690643, + "grad_norm": 4.46875, + "learning_rate": 0.0024200243489060315, + "loss": 3.014, + "mean_token_accuracy": 0.42774367332458496, + "num_tokens": 8557735800.0, + "step": 16740 + }, + { + "epoch": 4.527041644131963, + "grad_norm": 5.21875, + "learning_rate": 0.002419548493791788, + "loss": 2.7003, + "mean_token_accuracy": 0.46369045972824097, + "num_tokens": 8558216795.0, + "step": 16741 + }, + { + "epoch": 4.527312060573283, + "grad_norm": 4.0625, + "learning_rate": 0.0024190729019510295, + "loss": 2.7486, + "mean_token_accuracy": 0.42939117550849915, + "num_tokens": 8558725935.0, + "step": 16742 + }, + { + "epoch": 4.527582477014603, + "grad_norm": 3.28125, + "learning_rate": 0.0024185975733983503, + "loss": 2.8033, + "mean_token_accuracy": 0.45630645751953125, + "num_tokens": 8559250121.0, + "step": 16743 + }, + { + "epoch": 4.527852893455922, + "grad_norm": 3.828125, + "learning_rate": 0.0024181225081483346, + "loss": 2.7411, + "mean_token_accuracy": 0.44721347093582153, + "num_tokens": 8559728274.0, + "step": 16744 + }, + { + "epoch": 4.528123309897242, + "grad_norm": 3.34375, + "learning_rate": 0.002417647706215555, + "loss": 2.6588, + "mean_token_accuracy": 0.45761337876319885, + "num_tokens": 8560252544.0, + "step": 16745 + }, + { + "epoch": 4.528393726338561, + "grad_norm": 4.5, + "learning_rate": 0.0024171731676145857, + "loss": 2.8961, + "mean_token_accuracy": 0.4444162845611572, + "num_tokens": 8560776820.0, + "step": 16746 + }, + { + "epoch": 4.528664142779881, + "grad_norm": 3.90625, + "learning_rate": 0.0024166988923599825, + "loss": 2.9444, + "mean_token_accuracy": 0.42641112208366394, + "num_tokens": 8561263423.0, + "step": 16747 + }, + { + "epoch": 4.5289345592212005, + "grad_norm": 4.28125, + "learning_rate": 0.0024162248804663, + "loss": 2.96, + "mean_token_accuracy": 0.42341348528862, + "num_tokens": 8561787610.0, + "step": 16748 + }, + { + "epoch": 4.529204975662521, + "grad_norm": 3.703125, + "learning_rate": 0.002415751131948081, + "loss": 2.8396, + "mean_token_accuracy": 0.413453608751297, + "num_tokens": 8562311718.0, + "step": 16749 + }, + { + "epoch": 4.52947539210384, + "grad_norm": 3.671875, + "learning_rate": 0.002415277646819861, + "loss": 2.8975, + "mean_token_accuracy": 0.43881329894065857, + "num_tokens": 8562835932.0, + "step": 16750 + }, + { + "epoch": 4.52974580854516, + "grad_norm": 90.0, + "learning_rate": 0.0024148044250961705, + "loss": 4.4185, + "mean_token_accuracy": 0.27576202154159546, + "num_tokens": 8563297156.0, + "step": 16751 + }, + { + "epoch": 4.530016224986479, + "grad_norm": 9.9375, + "learning_rate": 0.002414331466791527, + "loss": 2.9962, + "mean_token_accuracy": 0.3978705406188965, + "num_tokens": 8563816091.0, + "step": 16752 + }, + { + "epoch": 4.530286641427799, + "grad_norm": 3.859375, + "learning_rate": 0.0024138587719204417, + "loss": 2.5565, + "mean_token_accuracy": 0.46073397994041443, + "num_tokens": 8564340351.0, + "step": 16753 + }, + { + "epoch": 4.530557057869118, + "grad_norm": 4.75, + "learning_rate": 0.0024133863404974206, + "loss": 2.9722, + "mean_token_accuracy": 0.4249444603919983, + "num_tokens": 8564864542.0, + "step": 16754 + }, + { + "epoch": 4.530827474310438, + "grad_norm": 4.1875, + "learning_rate": 0.0024129141725369595, + "loss": 2.6743, + "mean_token_accuracy": 0.4542234241962433, + "num_tokens": 8565388633.0, + "step": 16755 + }, + { + "epoch": 4.531097890751758, + "grad_norm": 3.765625, + "learning_rate": 0.0024124422680535424, + "loss": 2.7937, + "mean_token_accuracy": 0.456463485956192, + "num_tokens": 8565912806.0, + "step": 16756 + }, + { + "epoch": 4.531368307193078, + "grad_norm": 4.59375, + "learning_rate": 0.0024119706270616516, + "loss": 2.8276, + "mean_token_accuracy": 0.439300000667572, + "num_tokens": 8566436851.0, + "step": 16757 + }, + { + "epoch": 4.531638723634397, + "grad_norm": 4.0625, + "learning_rate": 0.0024114992495757586, + "loss": 2.7505, + "mean_token_accuracy": 0.46173667907714844, + "num_tokens": 8566945078.0, + "step": 16758 + }, + { + "epoch": 4.531909140075717, + "grad_norm": 3.640625, + "learning_rate": 0.0024110281356103233, + "loss": 2.62, + "mean_token_accuracy": 0.4499606788158417, + "num_tokens": 8567469359.0, + "step": 16759 + }, + { + "epoch": 4.532179556517036, + "grad_norm": 2.84375, + "learning_rate": 0.0024105572851798062, + "loss": 2.8382, + "mean_token_accuracy": 0.4481584131717682, + "num_tokens": 8567993539.0, + "step": 16760 + }, + { + "epoch": 4.532449972958355, + "grad_norm": 3.40625, + "learning_rate": 0.0024100866982986503, + "loss": 2.7237, + "mean_token_accuracy": 0.44393816590309143, + "num_tokens": 8568517599.0, + "step": 16761 + }, + { + "epoch": 4.5327203893996755, + "grad_norm": 27.5, + "learning_rate": 0.0024096163749812954, + "loss": 2.4381, + "mean_token_accuracy": 0.48552078008651733, + "num_tokens": 8569041774.0, + "step": 16762 + }, + { + "epoch": 4.532990805840996, + "grad_norm": 5.0, + "learning_rate": 0.002409146315242174, + "loss": 2.8203, + "mean_token_accuracy": 0.44294142723083496, + "num_tokens": 8569548176.0, + "step": 16763 + }, + { + "epoch": 4.533261222282315, + "grad_norm": 3.46875, + "learning_rate": 0.0024086765190957067, + "loss": 2.9978, + "mean_token_accuracy": 0.42141932249069214, + "num_tokens": 8570072439.0, + "step": 16764 + }, + { + "epoch": 4.533531638723634, + "grad_norm": 4.15625, + "learning_rate": 0.002408206986556309, + "loss": 2.7598, + "mean_token_accuracy": 0.4433484673500061, + "num_tokens": 8570575761.0, + "step": 16765 + }, + { + "epoch": 4.533802055164954, + "grad_norm": 10.5625, + "learning_rate": 0.0024077377176383867, + "loss": 2.4566, + "mean_token_accuracy": 0.4899126887321472, + "num_tokens": 8571080282.0, + "step": 16766 + }, + { + "epoch": 4.534072471606274, + "grad_norm": 4.15625, + "learning_rate": 0.0024072687123563416, + "loss": 2.8442, + "mean_token_accuracy": 0.4564414620399475, + "num_tokens": 8571557797.0, + "step": 16767 + }, + { + "epoch": 4.534342888047593, + "grad_norm": 3.765625, + "learning_rate": 0.0024067999707245605, + "loss": 2.8381, + "mean_token_accuracy": 0.42669978737831116, + "num_tokens": 8572082070.0, + "step": 16768 + }, + { + "epoch": 4.5346133044889125, + "grad_norm": 3.75, + "learning_rate": 0.002406331492757429, + "loss": 2.8742, + "mean_token_accuracy": 0.43666499853134155, + "num_tokens": 8572606265.0, + "step": 16769 + }, + { + "epoch": 4.534883720930233, + "grad_norm": 4.375, + "learning_rate": 0.0024058632784693173, + "loss": 2.7757, + "mean_token_accuracy": 0.4309576153755188, + "num_tokens": 8573130422.0, + "step": 16770 + }, + { + "epoch": 4.535154137371552, + "grad_norm": 35.0, + "learning_rate": 0.0024053953278745954, + "loss": 3.5398, + "mean_token_accuracy": 0.39005768299102783, + "num_tokens": 8573632245.0, + "step": 16771 + }, + { + "epoch": 4.535424553812872, + "grad_norm": 7.78125, + "learning_rate": 0.0024049276409876205, + "loss": 2.8515, + "mean_token_accuracy": 0.4191988706588745, + "num_tokens": 8574156398.0, + "step": 16772 + }, + { + "epoch": 4.535694970254191, + "grad_norm": 2.375, + "learning_rate": 0.002404460217822741, + "loss": 2.7325, + "mean_token_accuracy": 0.44664472341537476, + "num_tokens": 8574663908.0, + "step": 16773 + }, + { + "epoch": 4.535965386695511, + "grad_norm": 3.421875, + "learning_rate": 0.0024039930583943008, + "loss": 2.7864, + "mean_token_accuracy": 0.44482946395874023, + "num_tokens": 8575188191.0, + "step": 16774 + }, + { + "epoch": 4.53623580313683, + "grad_norm": 3.5, + "learning_rate": 0.0024035261627166327, + "loss": 2.692, + "mean_token_accuracy": 0.4513411521911621, + "num_tokens": 8575690768.0, + "step": 16775 + }, + { + "epoch": 4.5365062195781505, + "grad_norm": 3.15625, + "learning_rate": 0.002403059530804061, + "loss": 2.8235, + "mean_token_accuracy": 0.41640985012054443, + "num_tokens": 8576215054.0, + "step": 16776 + }, + { + "epoch": 4.53677663601947, + "grad_norm": 3.21875, + "learning_rate": 0.0024025931626709056, + "loss": 2.6007, + "mean_token_accuracy": 0.43723058700561523, + "num_tokens": 8576739172.0, + "step": 16777 + }, + { + "epoch": 4.53704705246079, + "grad_norm": 3.1875, + "learning_rate": 0.002402127058331477, + "loss": 2.6907, + "mean_token_accuracy": 0.4404897689819336, + "num_tokens": 8577263322.0, + "step": 16778 + }, + { + "epoch": 4.537317468902109, + "grad_norm": 3.453125, + "learning_rate": 0.0024016612178000725, + "loss": 2.8209, + "mean_token_accuracy": 0.4402239918708801, + "num_tokens": 8577776408.0, + "step": 16779 + }, + { + "epoch": 4.537587885343429, + "grad_norm": 3.5, + "learning_rate": 0.002401195641090988, + "loss": 2.8339, + "mean_token_accuracy": 0.4456830620765686, + "num_tokens": 8578247512.0, + "step": 16780 + }, + { + "epoch": 4.537858301784748, + "grad_norm": 3.40625, + "learning_rate": 0.00240073032821851, + "loss": 2.7499, + "mean_token_accuracy": 0.4408295154571533, + "num_tokens": 8578771776.0, + "step": 16781 + }, + { + "epoch": 4.538128718226068, + "grad_norm": 3.9375, + "learning_rate": 0.0024002652791969134, + "loss": 2.8298, + "mean_token_accuracy": 0.44462406635284424, + "num_tokens": 8579296048.0, + "step": 16782 + }, + { + "epoch": 4.5383991346673875, + "grad_norm": 3.890625, + "learning_rate": 0.002399800494040469, + "loss": 2.8385, + "mean_token_accuracy": 0.4119476079940796, + "num_tokens": 8579812481.0, + "step": 16783 + }, + { + "epoch": 4.538669551108708, + "grad_norm": 3.859375, + "learning_rate": 0.0023993359727634373, + "loss": 2.8042, + "mean_token_accuracy": 0.4385840892791748, + "num_tokens": 8580322301.0, + "step": 16784 + }, + { + "epoch": 4.538939967550027, + "grad_norm": 4.34375, + "learning_rate": 0.0023988717153800693, + "loss": 2.8028, + "mean_token_accuracy": 0.4688076674938202, + "num_tokens": 8580756896.0, + "step": 16785 + }, + { + "epoch": 4.539210383991347, + "grad_norm": 4.8125, + "learning_rate": 0.002398407721904613, + "loss": 2.7724, + "mean_token_accuracy": 0.4549323320388794, + "num_tokens": 8581281135.0, + "step": 16786 + }, + { + "epoch": 4.539480800432666, + "grad_norm": 3.609375, + "learning_rate": 0.0023979439923513027, + "loss": 2.7322, + "mean_token_accuracy": 0.4450102746486664, + "num_tokens": 8581805399.0, + "step": 16787 + }, + { + "epoch": 4.539751216873986, + "grad_norm": 3.78125, + "learning_rate": 0.0023974805267343675, + "loss": 2.7082, + "mean_token_accuracy": 0.4311809241771698, + "num_tokens": 8582329504.0, + "step": 16788 + }, + { + "epoch": 4.540021633315305, + "grad_norm": 3.515625, + "learning_rate": 0.0023970173250680285, + "loss": 2.8968, + "mean_token_accuracy": 0.42912498116493225, + "num_tokens": 8582853766.0, + "step": 16789 + }, + { + "epoch": 4.5402920497566255, + "grad_norm": 3.6875, + "learning_rate": 0.0023965543873664973, + "loss": 2.7361, + "mean_token_accuracy": 0.4377627372741699, + "num_tokens": 8583348879.0, + "step": 16790 + }, + { + "epoch": 4.540562466197945, + "grad_norm": 27.0, + "learning_rate": 0.002396091713643979, + "loss": 2.7333, + "mean_token_accuracy": 0.4542917311191559, + "num_tokens": 8583873127.0, + "step": 16791 + }, + { + "epoch": 4.540832882639265, + "grad_norm": 5.84375, + "learning_rate": 0.002395629303914669, + "loss": 2.9133, + "mean_token_accuracy": 0.4269337058067322, + "num_tokens": 8584397355.0, + "step": 16792 + }, + { + "epoch": 4.541103299080584, + "grad_norm": 3.390625, + "learning_rate": 0.0023951671581927555, + "loss": 2.8577, + "mean_token_accuracy": 0.4350918233394623, + "num_tokens": 8584921520.0, + "step": 16793 + }, + { + "epoch": 4.541373715521904, + "grad_norm": 4.15625, + "learning_rate": 0.0023947052764924192, + "loss": 2.8421, + "mean_token_accuracy": 0.43233633041381836, + "num_tokens": 8585445777.0, + "step": 16794 + }, + { + "epoch": 4.541644131963223, + "grad_norm": 3.546875, + "learning_rate": 0.0023942436588278307, + "loss": 2.7196, + "mean_token_accuracy": 0.457893043756485, + "num_tokens": 8585934728.0, + "step": 16795 + }, + { + "epoch": 4.541914548404543, + "grad_norm": 3.0, + "learning_rate": 0.002393782305213155, + "loss": 2.6117, + "mean_token_accuracy": 0.45036131143569946, + "num_tokens": 8586458917.0, + "step": 16796 + }, + { + "epoch": 4.5421849648458625, + "grad_norm": 3.25, + "learning_rate": 0.0023933212156625485, + "loss": 2.7161, + "mean_token_accuracy": 0.44539546966552734, + "num_tokens": 8586983168.0, + "step": 16797 + }, + { + "epoch": 4.542455381287183, + "grad_norm": 3.625, + "learning_rate": 0.0023928603901901572, + "loss": 2.7881, + "mean_token_accuracy": 0.45208677649497986, + "num_tokens": 8587507427.0, + "step": 16798 + }, + { + "epoch": 4.542725797728502, + "grad_norm": 4.0, + "learning_rate": 0.0023923998288101207, + "loss": 2.8174, + "mean_token_accuracy": 0.43764328956604004, + "num_tokens": 8587995804.0, + "step": 16799 + }, + { + "epoch": 4.542996214169822, + "grad_norm": 3.546875, + "learning_rate": 0.002391939531536571, + "loss": 2.8411, + "mean_token_accuracy": 0.44346320629119873, + "num_tokens": 8588519961.0, + "step": 16800 + }, + { + "epoch": 4.543266630611141, + "grad_norm": 4.0625, + "learning_rate": 0.0023914794983836314, + "loss": 2.8026, + "mean_token_accuracy": 0.4407924711704254, + "num_tokens": 8589044225.0, + "step": 16801 + }, + { + "epoch": 4.54353704705246, + "grad_norm": 3.765625, + "learning_rate": 0.002391019729365416, + "loss": 2.8524, + "mean_token_accuracy": 0.4259771704673767, + "num_tokens": 8589568389.0, + "step": 16802 + }, + { + "epoch": 4.54380746349378, + "grad_norm": 3.59375, + "learning_rate": 0.0023905602244960336, + "loss": 2.8268, + "mean_token_accuracy": 0.44691407680511475, + "num_tokens": 8590075800.0, + "step": 16803 + }, + { + "epoch": 4.5440778799351005, + "grad_norm": 5.875, + "learning_rate": 0.0023901009837895815, + "loss": 2.6846, + "mean_token_accuracy": 0.47144585847854614, + "num_tokens": 8590599961.0, + "step": 16804 + }, + { + "epoch": 4.54434829637642, + "grad_norm": 4.125, + "learning_rate": 0.0023896420072601512, + "loss": 2.6053, + "mean_token_accuracy": 0.4379013478755951, + "num_tokens": 8591124186.0, + "step": 16805 + }, + { + "epoch": 4.544618712817739, + "grad_norm": 3.703125, + "learning_rate": 0.002389183294921826, + "loss": 2.8948, + "mean_token_accuracy": 0.4395802617073059, + "num_tokens": 8591648327.0, + "step": 16806 + }, + { + "epoch": 4.544889129259059, + "grad_norm": 5.78125, + "learning_rate": 0.0023887248467886813, + "loss": 2.495, + "mean_token_accuracy": 0.4666231572628021, + "num_tokens": 8592172587.0, + "step": 16807 + }, + { + "epoch": 4.545159545700379, + "grad_norm": 2.46875, + "learning_rate": 0.0023882666628747794, + "loss": 2.7995, + "mean_token_accuracy": 0.45477449893951416, + "num_tokens": 8592641746.0, + "step": 16808 + }, + { + "epoch": 4.545429962141698, + "grad_norm": 4.125, + "learning_rate": 0.0023878087431941832, + "loss": 2.8606, + "mean_token_accuracy": 0.44766631722450256, + "num_tokens": 8593165909.0, + "step": 16809 + }, + { + "epoch": 4.5457003785830175, + "grad_norm": 3.3125, + "learning_rate": 0.0023873510877609424, + "loss": 2.7174, + "mean_token_accuracy": 0.4553542733192444, + "num_tokens": 8593627547.0, + "step": 16810 + }, + { + "epoch": 4.5459707950243375, + "grad_norm": 52.5, + "learning_rate": 0.0023868936965890965, + "loss": 2.843, + "mean_token_accuracy": 0.4562479853630066, + "num_tokens": 8594151706.0, + "step": 16811 + }, + { + "epoch": 4.546241211465657, + "grad_norm": 6.84375, + "learning_rate": 0.0023864365696926837, + "loss": 2.7339, + "mean_token_accuracy": 0.4337037205696106, + "num_tokens": 8594625944.0, + "step": 16812 + }, + { + "epoch": 4.546511627906977, + "grad_norm": 3.46875, + "learning_rate": 0.002385979707085727, + "loss": 2.7769, + "mean_token_accuracy": 0.4551014304161072, + "num_tokens": 8595150162.0, + "step": 16813 + }, + { + "epoch": 4.546782044348296, + "grad_norm": 3.578125, + "learning_rate": 0.002385523108782244, + "loss": 2.5955, + "mean_token_accuracy": 0.43765804171562195, + "num_tokens": 8595674358.0, + "step": 16814 + }, + { + "epoch": 4.547052460789616, + "grad_norm": 3.578125, + "learning_rate": 0.002385066774796248, + "loss": 2.9204, + "mean_token_accuracy": 0.4301888048648834, + "num_tokens": 8596177659.0, + "step": 16815 + }, + { + "epoch": 4.547322877230935, + "grad_norm": 3.015625, + "learning_rate": 0.002384610705141735, + "loss": 2.8333, + "mean_token_accuracy": 0.4326575696468353, + "num_tokens": 8596701878.0, + "step": 16816 + }, + { + "epoch": 4.547593293672255, + "grad_norm": 12.5625, + "learning_rate": 0.0023841548998327043, + "loss": 2.4736, + "mean_token_accuracy": 0.5059348940849304, + "num_tokens": 8597168839.0, + "step": 16817 + }, + { + "epoch": 4.547863710113575, + "grad_norm": 4.15625, + "learning_rate": 0.002383699358883138, + "loss": 2.8723, + "mean_token_accuracy": 0.44566941261291504, + "num_tokens": 8597678557.0, + "step": 16818 + }, + { + "epoch": 4.548134126554895, + "grad_norm": 3.515625, + "learning_rate": 0.0023832440823070133, + "loss": 2.7316, + "mean_token_accuracy": 0.44470900297164917, + "num_tokens": 8598202675.0, + "step": 16819 + }, + { + "epoch": 4.548404542996214, + "grad_norm": 3.3125, + "learning_rate": 0.0023827890701183024, + "loss": 2.7793, + "mean_token_accuracy": 0.4507443904876709, + "num_tokens": 8598726799.0, + "step": 16820 + }, + { + "epoch": 4.548674959437534, + "grad_norm": 3.921875, + "learning_rate": 0.002382334322330964, + "loss": 2.5892, + "mean_token_accuracy": 0.4565635323524475, + "num_tokens": 8599250868.0, + "step": 16821 + }, + { + "epoch": 4.548945375878853, + "grad_norm": 3.890625, + "learning_rate": 0.0023818798389589496, + "loss": 2.802, + "mean_token_accuracy": 0.4407624900341034, + "num_tokens": 8599769755.0, + "step": 16822 + }, + { + "epoch": 4.549215792320173, + "grad_norm": 3.65625, + "learning_rate": 0.0023814256200162063, + "loss": 2.7878, + "mean_token_accuracy": 0.43228521943092346, + "num_tokens": 8600293975.0, + "step": 16823 + }, + { + "epoch": 4.5494862087614925, + "grad_norm": 3.96875, + "learning_rate": 0.0023809716655166725, + "loss": 2.7382, + "mean_token_accuracy": 0.44345128536224365, + "num_tokens": 8600818116.0, + "step": 16824 + }, + { + "epoch": 4.5497566252028125, + "grad_norm": 4.09375, + "learning_rate": 0.002380517975474273, + "loss": 2.7246, + "mean_token_accuracy": 0.46435120701789856, + "num_tokens": 8601342342.0, + "step": 16825 + }, + { + "epoch": 4.550027041644132, + "grad_norm": 3.9375, + "learning_rate": 0.0023800645499029312, + "loss": 2.8621, + "mean_token_accuracy": 0.4279969036579132, + "num_tokens": 8601866567.0, + "step": 16826 + }, + { + "epoch": 4.550297458085452, + "grad_norm": 3.734375, + "learning_rate": 0.0023796113888165586, + "loss": 2.7372, + "mean_token_accuracy": 0.4279213845729828, + "num_tokens": 8602390796.0, + "step": 16827 + }, + { + "epoch": 4.550567874526771, + "grad_norm": 3.75, + "learning_rate": 0.002379158492229059, + "loss": 2.8025, + "mean_token_accuracy": 0.4605304002761841, + "num_tokens": 8602914974.0, + "step": 16828 + }, + { + "epoch": 4.550838290968091, + "grad_norm": 3.078125, + "learning_rate": 0.0023787058601543298, + "loss": 2.7252, + "mean_token_accuracy": 0.45910683274269104, + "num_tokens": 8603383480.0, + "step": 16829 + }, + { + "epoch": 4.55110870740941, + "grad_norm": 3.90625, + "learning_rate": 0.002378253492606258, + "loss": 2.8263, + "mean_token_accuracy": 0.44231611490249634, + "num_tokens": 8603907630.0, + "step": 16830 + }, + { + "epoch": 4.55137912385073, + "grad_norm": 78.5, + "learning_rate": 0.0023778013895987245, + "loss": 4.5142, + "mean_token_accuracy": 0.24974623322486877, + "num_tokens": 8604404639.0, + "step": 16831 + }, + { + "epoch": 4.55164954029205, + "grad_norm": 8.625, + "learning_rate": 0.0023773495511455996, + "loss": 2.6754, + "mean_token_accuracy": 0.4746134877204895, + "num_tokens": 8604928866.0, + "step": 16832 + }, + { + "epoch": 4.55191995673337, + "grad_norm": 3.453125, + "learning_rate": 0.002376897977260749, + "loss": 2.6466, + "mean_token_accuracy": 0.44421011209487915, + "num_tokens": 8605452959.0, + "step": 16833 + }, + { + "epoch": 4.552190373174689, + "grad_norm": 4.21875, + "learning_rate": 0.002376446667958027, + "loss": 2.8078, + "mean_token_accuracy": 0.435676634311676, + "num_tokens": 8605969719.0, + "step": 16834 + }, + { + "epoch": 4.552460789616009, + "grad_norm": 3.234375, + "learning_rate": 0.002375995623251282, + "loss": 2.8159, + "mean_token_accuracy": 0.44238635897636414, + "num_tokens": 8606493980.0, + "step": 16835 + }, + { + "epoch": 4.552731206057328, + "grad_norm": 4.15625, + "learning_rate": 0.002375544843154351, + "loss": 2.8178, + "mean_token_accuracy": 0.4441772699356079, + "num_tokens": 8607018237.0, + "step": 16836 + }, + { + "epoch": 4.553001622498648, + "grad_norm": 3.6875, + "learning_rate": 0.002375094327681069, + "loss": 2.89, + "mean_token_accuracy": 0.440647691488266, + "num_tokens": 8607542509.0, + "step": 16837 + }, + { + "epoch": 4.5532720389399675, + "grad_norm": 3.828125, + "learning_rate": 0.002374644076845256, + "loss": 2.5947, + "mean_token_accuracy": 0.4604651927947998, + "num_tokens": 8608066625.0, + "step": 16838 + }, + { + "epoch": 4.5535424553812875, + "grad_norm": 3.765625, + "learning_rate": 0.0023741940906607277, + "loss": 2.7083, + "mean_token_accuracy": 0.4474133849143982, + "num_tokens": 8608590784.0, + "step": 16839 + }, + { + "epoch": 4.553812871822607, + "grad_norm": 3.765625, + "learning_rate": 0.0023737443691412923, + "loss": 2.6927, + "mean_token_accuracy": 0.47556132078170776, + "num_tokens": 8609086777.0, + "step": 16840 + }, + { + "epoch": 4.554083288263927, + "grad_norm": 3.453125, + "learning_rate": 0.002373294912300749, + "loss": 2.8137, + "mean_token_accuracy": 0.447600781917572, + "num_tokens": 8609611015.0, + "step": 16841 + }, + { + "epoch": 4.554353704705246, + "grad_norm": 4.0625, + "learning_rate": 0.0023728457201528863, + "loss": 2.7482, + "mean_token_accuracy": 0.44122183322906494, + "num_tokens": 8610135204.0, + "step": 16842 + }, + { + "epoch": 4.554624121146565, + "grad_norm": 3.671875, + "learning_rate": 0.002372396792711487, + "loss": 2.7119, + "mean_token_accuracy": 0.43288111686706543, + "num_tokens": 8610659398.0, + "step": 16843 + }, + { + "epoch": 4.554894537587885, + "grad_norm": 4.6875, + "learning_rate": 0.002371948129990328, + "loss": 2.7782, + "mean_token_accuracy": 0.4078378677368164, + "num_tokens": 8611143362.0, + "step": 16844 + }, + { + "epoch": 4.555164954029205, + "grad_norm": 3.703125, + "learning_rate": 0.002371499732003172, + "loss": 2.8224, + "mean_token_accuracy": 0.4385431706905365, + "num_tokens": 8611667637.0, + "step": 16845 + }, + { + "epoch": 4.555435370470525, + "grad_norm": 4.40625, + "learning_rate": 0.00237105159876378, + "loss": 2.7573, + "mean_token_accuracy": 0.4351298213005066, + "num_tokens": 8612191768.0, + "step": 16846 + }, + { + "epoch": 4.555705786911844, + "grad_norm": 3.328125, + "learning_rate": 0.0023706037302859014, + "loss": 2.7112, + "mean_token_accuracy": 0.45188605785369873, + "num_tokens": 8612715974.0, + "step": 16847 + }, + { + "epoch": 4.555976203353164, + "grad_norm": 4.0625, + "learning_rate": 0.0023701561265832764, + "loss": 2.8819, + "mean_token_accuracy": 0.43044692277908325, + "num_tokens": 8613240203.0, + "step": 16848 + }, + { + "epoch": 4.556246619794484, + "grad_norm": 3.59375, + "learning_rate": 0.002369708787669642, + "loss": 2.6892, + "mean_token_accuracy": 0.44667279720306396, + "num_tokens": 8613764470.0, + "step": 16849 + }, + { + "epoch": 4.556517036235803, + "grad_norm": 3.578125, + "learning_rate": 0.00236926171355872, + "loss": 2.8248, + "mean_token_accuracy": 0.42661112546920776, + "num_tokens": 8614288745.0, + "step": 16850 + }, + { + "epoch": 4.556787452677122, + "grad_norm": 21.5, + "learning_rate": 0.002368814904264231, + "loss": 3.1775, + "mean_token_accuracy": 0.4536254405975342, + "num_tokens": 8614812776.0, + "step": 16851 + }, + { + "epoch": 4.5570578691184425, + "grad_norm": 7.90625, + "learning_rate": 0.0023683683597998826, + "loss": 2.7338, + "mean_token_accuracy": 0.4321795105934143, + "num_tokens": 8615337054.0, + "step": 16852 + }, + { + "epoch": 4.557328285559762, + "grad_norm": 4.125, + "learning_rate": 0.002367922080179378, + "loss": 2.7588, + "mean_token_accuracy": 0.4279268980026245, + "num_tokens": 8615861126.0, + "step": 16853 + }, + { + "epoch": 4.557598702001082, + "grad_norm": 3.578125, + "learning_rate": 0.0023674760654164083, + "loss": 2.7292, + "mean_token_accuracy": 0.44639158248901367, + "num_tokens": 8616385310.0, + "step": 16854 + }, + { + "epoch": 4.557869118442401, + "grad_norm": 3.5625, + "learning_rate": 0.0023670303155246607, + "loss": 2.7976, + "mean_token_accuracy": 0.43512454628944397, + "num_tokens": 8616909498.0, + "step": 16855 + }, + { + "epoch": 4.558139534883721, + "grad_norm": 5.8125, + "learning_rate": 0.0023665848305178113, + "loss": 2.7259, + "mean_token_accuracy": 0.44034382700920105, + "num_tokens": 8617388432.0, + "step": 16856 + }, + { + "epoch": 4.55840995132504, + "grad_norm": 3.546875, + "learning_rate": 0.002366139610409527, + "loss": 2.6142, + "mean_token_accuracy": 0.44043317437171936, + "num_tokens": 8617912663.0, + "step": 16857 + }, + { + "epoch": 4.55868036776636, + "grad_norm": 3.453125, + "learning_rate": 0.0023656946552134705, + "loss": 2.8894, + "mean_token_accuracy": 0.4326101541519165, + "num_tokens": 8618421444.0, + "step": 16858 + }, + { + "epoch": 4.5589507842076795, + "grad_norm": 3.375, + "learning_rate": 0.0023652499649432944, + "loss": 2.6862, + "mean_token_accuracy": 0.43753528594970703, + "num_tokens": 8618924455.0, + "step": 16859 + }, + { + "epoch": 4.559221200649, + "grad_norm": 4.0, + "learning_rate": 0.0023648055396126417, + "loss": 2.8054, + "mean_token_accuracy": 0.46758636832237244, + "num_tokens": 8619384200.0, + "step": 16860 + }, + { + "epoch": 4.559491617090319, + "grad_norm": 3.515625, + "learning_rate": 0.00236436137923515, + "loss": 2.6141, + "mean_token_accuracy": 0.4533064663410187, + "num_tokens": 8619850392.0, + "step": 16861 + }, + { + "epoch": 4.559762033531639, + "grad_norm": 3.03125, + "learning_rate": 0.0023639174838244463, + "loss": 2.7827, + "mean_token_accuracy": 0.446257621049881, + "num_tokens": 8620374669.0, + "step": 16862 + }, + { + "epoch": 4.560032449972958, + "grad_norm": 3.375, + "learning_rate": 0.002363473853394152, + "loss": 2.7861, + "mean_token_accuracy": 0.4725826680660248, + "num_tokens": 8620810179.0, + "step": 16863 + }, + { + "epoch": 4.560302866414278, + "grad_norm": 4.59375, + "learning_rate": 0.0023630304879578784, + "loss": 2.902, + "mean_token_accuracy": 0.4319244623184204, + "num_tokens": 8621334447.0, + "step": 16864 + }, + { + "epoch": 4.560573282855597, + "grad_norm": 4.21875, + "learning_rate": 0.0023625873875292277, + "loss": 2.7063, + "mean_token_accuracy": 0.45699524879455566, + "num_tokens": 8621858705.0, + "step": 16865 + }, + { + "epoch": 4.5608436992969175, + "grad_norm": 4.0625, + "learning_rate": 0.002362144552121797, + "loss": 2.9183, + "mean_token_accuracy": 0.44301915168762207, + "num_tokens": 8622372600.0, + "step": 16866 + }, + { + "epoch": 4.561114115738237, + "grad_norm": 3.84375, + "learning_rate": 0.0023617019817491753, + "loss": 2.5702, + "mean_token_accuracy": 0.462662011384964, + "num_tokens": 8622834276.0, + "step": 16867 + }, + { + "epoch": 4.561384532179557, + "grad_norm": 3.234375, + "learning_rate": 0.0023612596764249387, + "loss": 2.8157, + "mean_token_accuracy": 0.4307079017162323, + "num_tokens": 8623358539.0, + "step": 16868 + }, + { + "epoch": 4.561654948620876, + "grad_norm": 3.265625, + "learning_rate": 0.002360817636162662, + "loss": 2.6399, + "mean_token_accuracy": 0.45970287919044495, + "num_tokens": 8623829353.0, + "step": 16869 + }, + { + "epoch": 4.561925365062196, + "grad_norm": 3.171875, + "learning_rate": 0.0023603758609759056, + "loss": 2.6851, + "mean_token_accuracy": 0.46050602197647095, + "num_tokens": 8624353582.0, + "step": 16870 + }, + { + "epoch": 4.562195781503515, + "grad_norm": 182.0, + "learning_rate": 0.0023599343508782246, + "loss": 5.7879, + "mean_token_accuracy": 0.22640016674995422, + "num_tokens": 8624877751.0, + "step": 16871 + }, + { + "epoch": 4.562466197944835, + "grad_norm": 7.78125, + "learning_rate": 0.0023594931058831667, + "loss": 2.936, + "mean_token_accuracy": 0.4172465205192566, + "num_tokens": 8625401966.0, + "step": 16872 + }, + { + "epoch": 4.5627366143861545, + "grad_norm": 7.0625, + "learning_rate": 0.002359052126004272, + "loss": 2.4589, + "mean_token_accuracy": 0.48290854692459106, + "num_tokens": 8625926085.0, + "step": 16873 + }, + { + "epoch": 4.563007030827475, + "grad_norm": 4.46875, + "learning_rate": 0.0023586114112550674, + "loss": 2.8242, + "mean_token_accuracy": 0.4328264594078064, + "num_tokens": 8626449688.0, + "step": 16874 + }, + { + "epoch": 4.563277447268794, + "grad_norm": 4.09375, + "learning_rate": 0.002358170961649078, + "loss": 2.6983, + "mean_token_accuracy": 0.4711996018886566, + "num_tokens": 8626973959.0, + "step": 16875 + }, + { + "epoch": 4.563547863710114, + "grad_norm": 4.1875, + "learning_rate": 0.002357730777199818, + "loss": 2.8465, + "mean_token_accuracy": 0.4502559006214142, + "num_tokens": 8627458513.0, + "step": 16876 + }, + { + "epoch": 4.563818280151433, + "grad_norm": 3.828125, + "learning_rate": 0.0023572908579207923, + "loss": 2.74, + "mean_token_accuracy": 0.4386943578720093, + "num_tokens": 8627982783.0, + "step": 16877 + }, + { + "epoch": 4.564088696592753, + "grad_norm": 3.0625, + "learning_rate": 0.0023568512038255004, + "loss": 2.89, + "mean_token_accuracy": 0.4453280568122864, + "num_tokens": 8628507064.0, + "step": 16878 + }, + { + "epoch": 4.564359113034072, + "grad_norm": 3.59375, + "learning_rate": 0.0023564118149274303, + "loss": 2.7104, + "mean_token_accuracy": 0.4540298581123352, + "num_tokens": 8629031221.0, + "step": 16879 + }, + { + "epoch": 4.5646295294753925, + "grad_norm": 2.828125, + "learning_rate": 0.0023559726912400648, + "loss": 2.8398, + "mean_token_accuracy": 0.43168798089027405, + "num_tokens": 8629555431.0, + "step": 16880 + }, + { + "epoch": 4.564899945916712, + "grad_norm": 3.578125, + "learning_rate": 0.002355533832776879, + "loss": 2.9906, + "mean_token_accuracy": 0.43238383531570435, + "num_tokens": 8630079627.0, + "step": 16881 + }, + { + "epoch": 4.565170362358032, + "grad_norm": 3.671875, + "learning_rate": 0.002355095239551336, + "loss": 2.6257, + "mean_token_accuracy": 0.43597060441970825, + "num_tokens": 8630603891.0, + "step": 16882 + }, + { + "epoch": 4.565440778799351, + "grad_norm": 2.953125, + "learning_rate": 0.0023546569115768944, + "loss": 2.7135, + "mean_token_accuracy": 0.4518408179283142, + "num_tokens": 8631128038.0, + "step": 16883 + }, + { + "epoch": 4.56571119524067, + "grad_norm": 4.15625, + "learning_rate": 0.0023542188488670027, + "loss": 2.9028, + "mean_token_accuracy": 0.4476260542869568, + "num_tokens": 8631593120.0, + "step": 16884 + }, + { + "epoch": 4.56598161168199, + "grad_norm": 4.71875, + "learning_rate": 0.0023537810514351015, + "loss": 2.6099, + "mean_token_accuracy": 0.4378151297569275, + "num_tokens": 8632117252.0, + "step": 16885 + }, + { + "epoch": 4.56625202812331, + "grad_norm": 2.8125, + "learning_rate": 0.0023533435192946263, + "loss": 2.7078, + "mean_token_accuracy": 0.4736916422843933, + "num_tokens": 8632619062.0, + "step": 16886 + }, + { + "epoch": 4.5665224445646295, + "grad_norm": 3.875, + "learning_rate": 0.0023529062524589995, + "loss": 2.8161, + "mean_token_accuracy": 0.44461196660995483, + "num_tokens": 8633143347.0, + "step": 16887 + }, + { + "epoch": 4.566792861005949, + "grad_norm": 4.03125, + "learning_rate": 0.002352469250941638, + "loss": 2.8868, + "mean_token_accuracy": 0.421933114528656, + "num_tokens": 8633667462.0, + "step": 16888 + }, + { + "epoch": 4.567063277447269, + "grad_norm": 3.3125, + "learning_rate": 0.0023520325147559505, + "loss": 2.6946, + "mean_token_accuracy": 0.45926713943481445, + "num_tokens": 8634191734.0, + "step": 16889 + }, + { + "epoch": 4.567333693888589, + "grad_norm": 3.875, + "learning_rate": 0.0023515960439153384, + "loss": 2.9215, + "mean_token_accuracy": 0.4314025342464447, + "num_tokens": 8634715907.0, + "step": 16890 + }, + { + "epoch": 4.567604110329908, + "grad_norm": 31.625, + "learning_rate": 0.0023511598384331916, + "loss": 3.4764, + "mean_token_accuracy": 0.3488362729549408, + "num_tokens": 8635240167.0, + "step": 16891 + }, + { + "epoch": 4.567874526771227, + "grad_norm": 6.96875, + "learning_rate": 0.0023507238983228973, + "loss": 2.8008, + "mean_token_accuracy": 0.42486509680747986, + "num_tokens": 8635764192.0, + "step": 16892 + }, + { + "epoch": 4.568144943212547, + "grad_norm": 2.609375, + "learning_rate": 0.002350288223597827, + "loss": 2.7939, + "mean_token_accuracy": 0.4244823455810547, + "num_tokens": 8636288284.0, + "step": 16893 + }, + { + "epoch": 4.568415359653867, + "grad_norm": 6.8125, + "learning_rate": 0.002349852814271354, + "loss": 2.8469, + "mean_token_accuracy": 0.45633038878440857, + "num_tokens": 8636750365.0, + "step": 16894 + }, + { + "epoch": 4.568685776095187, + "grad_norm": 4.03125, + "learning_rate": 0.0023494176703568325, + "loss": 2.6678, + "mean_token_accuracy": 0.45608875155448914, + "num_tokens": 8637274611.0, + "step": 16895 + }, + { + "epoch": 4.568956192536506, + "grad_norm": 3.484375, + "learning_rate": 0.0023489827918676182, + "loss": 2.9657, + "mean_token_accuracy": 0.42989131808280945, + "num_tokens": 8637788542.0, + "step": 16896 + }, + { + "epoch": 4.569226608977826, + "grad_norm": 3.6875, + "learning_rate": 0.002348548178817052, + "loss": 2.7945, + "mean_token_accuracy": 0.45810091495513916, + "num_tokens": 8638267833.0, + "step": 16897 + }, + { + "epoch": 4.569497025419145, + "grad_norm": 6.46875, + "learning_rate": 0.002348113831218471, + "loss": 2.7291, + "mean_token_accuracy": 0.47347554564476013, + "num_tokens": 8638763688.0, + "step": 16898 + }, + { + "epoch": 4.569767441860465, + "grad_norm": 2.78125, + "learning_rate": 0.002347679749085201, + "loss": 2.9627, + "mean_token_accuracy": 0.42719748616218567, + "num_tokens": 8639287935.0, + "step": 16899 + }, + { + "epoch": 4.570037858301784, + "grad_norm": 2.890625, + "learning_rate": 0.0023472459324305607, + "loss": 2.886, + "mean_token_accuracy": 0.4371853470802307, + "num_tokens": 8639812216.0, + "step": 16900 + }, + { + "epoch": 4.5703082747431045, + "grad_norm": 4.15625, + "learning_rate": 0.002346812381267861, + "loss": 2.5716, + "mean_token_accuracy": 0.4544481635093689, + "num_tokens": 8640273170.0, + "step": 16901 + }, + { + "epoch": 4.570578691184424, + "grad_norm": 3.765625, + "learning_rate": 0.002346379095610406, + "loss": 2.7383, + "mean_token_accuracy": 0.4466084837913513, + "num_tokens": 8640797371.0, + "step": 16902 + }, + { + "epoch": 4.570849107625744, + "grad_norm": 3.71875, + "learning_rate": 0.0023459460754714874, + "loss": 2.7436, + "mean_token_accuracy": 0.45586758852005005, + "num_tokens": 8641274181.0, + "step": 16903 + }, + { + "epoch": 4.571119524067063, + "grad_norm": 4.3125, + "learning_rate": 0.002345513320864394, + "loss": 2.8718, + "mean_token_accuracy": 0.4234945774078369, + "num_tokens": 8641798426.0, + "step": 16904 + }, + { + "epoch": 4.571389940508383, + "grad_norm": 4.3125, + "learning_rate": 0.0023450808318024037, + "loss": 2.6408, + "mean_token_accuracy": 0.4668588936328888, + "num_tokens": 8642290133.0, + "step": 16905 + }, + { + "epoch": 4.571660356949702, + "grad_norm": 4.125, + "learning_rate": 0.0023446486082987846, + "loss": 2.6778, + "mean_token_accuracy": 0.4604283571243286, + "num_tokens": 8642814406.0, + "step": 16906 + }, + { + "epoch": 4.571930773391022, + "grad_norm": 3.953125, + "learning_rate": 0.0023442166503668013, + "loss": 2.7024, + "mean_token_accuracy": 0.4451861083507538, + "num_tokens": 8643338528.0, + "step": 16907 + }, + { + "epoch": 4.572201189832342, + "grad_norm": 4.8125, + "learning_rate": 0.002343784958019704, + "loss": 2.8355, + "mean_token_accuracy": 0.4353317618370056, + "num_tokens": 8643810407.0, + "step": 16908 + }, + { + "epoch": 4.572471606273662, + "grad_norm": 3.265625, + "learning_rate": 0.0023433535312707412, + "loss": 2.6999, + "mean_token_accuracy": 0.4630890488624573, + "num_tokens": 8644334677.0, + "step": 16909 + }, + { + "epoch": 4.572742022714981, + "grad_norm": 3.71875, + "learning_rate": 0.002342922370133151, + "loss": 2.8458, + "mean_token_accuracy": 0.4451797902584076, + "num_tokens": 8644858793.0, + "step": 16910 + }, + { + "epoch": 4.573012439156301, + "grad_norm": 37.75, + "learning_rate": 0.00234249147462016, + "loss": 2.9604, + "mean_token_accuracy": 0.4405880570411682, + "num_tokens": 8645383030.0, + "step": 16911 + }, + { + "epoch": 4.57328285559762, + "grad_norm": 8.4375, + "learning_rate": 0.002342060844744992, + "loss": 2.972, + "mean_token_accuracy": 0.41766059398651123, + "num_tokens": 8645907305.0, + "step": 16912 + }, + { + "epoch": 4.57355327203894, + "grad_norm": 2.59375, + "learning_rate": 0.002341630480520857, + "loss": 2.745, + "mean_token_accuracy": 0.45085927844047546, + "num_tokens": 8646431523.0, + "step": 16913 + }, + { + "epoch": 4.573823688480259, + "grad_norm": 3.65625, + "learning_rate": 0.002341200381960962, + "loss": 2.7142, + "mean_token_accuracy": 0.45254769921302795, + "num_tokens": 8646955775.0, + "step": 16914 + }, + { + "epoch": 4.5740941049215795, + "grad_norm": 3.796875, + "learning_rate": 0.0023407705490785043, + "loss": 2.5815, + "mean_token_accuracy": 0.4503266215324402, + "num_tokens": 8647479916.0, + "step": 16915 + }, + { + "epoch": 4.574364521362899, + "grad_norm": 3.90625, + "learning_rate": 0.0023403409818866707, + "loss": 2.7756, + "mean_token_accuracy": 0.4426129460334778, + "num_tokens": 8647959906.0, + "step": 16916 + }, + { + "epoch": 4.574634937804219, + "grad_norm": 3.3125, + "learning_rate": 0.0023399116803986407, + "loss": 2.8965, + "mean_token_accuracy": 0.44199520349502563, + "num_tokens": 8648484137.0, + "step": 16917 + }, + { + "epoch": 4.574905354245538, + "grad_norm": 4.21875, + "learning_rate": 0.00233948264462759, + "loss": 2.6931, + "mean_token_accuracy": 0.44714298844337463, + "num_tokens": 8648958506.0, + "step": 16918 + }, + { + "epoch": 4.575175770686858, + "grad_norm": 3.6875, + "learning_rate": 0.0023390538745866806, + "loss": 2.7309, + "mean_token_accuracy": 0.45310139656066895, + "num_tokens": 8649449871.0, + "step": 16919 + }, + { + "epoch": 4.575446187128177, + "grad_norm": 3.921875, + "learning_rate": 0.0023386253702890668, + "loss": 2.5171, + "mean_token_accuracy": 0.45369952917099, + "num_tokens": 8649974099.0, + "step": 16920 + }, + { + "epoch": 4.575716603569497, + "grad_norm": 2.75, + "learning_rate": 0.0023381971317478997, + "loss": 2.8813, + "mean_token_accuracy": 0.4473858177661896, + "num_tokens": 8650488821.0, + "step": 16921 + }, + { + "epoch": 4.575987020010817, + "grad_norm": 24.125, + "learning_rate": 0.002337769158976316, + "loss": 2.7963, + "mean_token_accuracy": 0.4703521728515625, + "num_tokens": 8650917486.0, + "step": 16922 + }, + { + "epoch": 4.576257436452137, + "grad_norm": 4.59375, + "learning_rate": 0.002337341451987448, + "loss": 2.62, + "mean_token_accuracy": 0.457661896944046, + "num_tokens": 8651441600.0, + "step": 16923 + }, + { + "epoch": 4.576527852893456, + "grad_norm": 3.1875, + "learning_rate": 0.0023369140107944216, + "loss": 2.8303, + "mean_token_accuracy": 0.4343756437301636, + "num_tokens": 8651965769.0, + "step": 16924 + }, + { + "epoch": 4.576798269334775, + "grad_norm": 4.09375, + "learning_rate": 0.002336486835410349, + "loss": 3.0053, + "mean_token_accuracy": 0.40790772438049316, + "num_tokens": 8652489937.0, + "step": 16925 + }, + { + "epoch": 4.577068685776095, + "grad_norm": 3.828125, + "learning_rate": 0.0023360599258483365, + "loss": 2.78, + "mean_token_accuracy": 0.4373815059661865, + "num_tokens": 8652991280.0, + "step": 16926 + }, + { + "epoch": 4.577339102217415, + "grad_norm": 4.625, + "learning_rate": 0.0023356332821214873, + "loss": 2.7722, + "mean_token_accuracy": 0.4256458580493927, + "num_tokens": 8653515531.0, + "step": 16927 + }, + { + "epoch": 4.5776095186587344, + "grad_norm": 3.8125, + "learning_rate": 0.0023352069042428874, + "loss": 2.8025, + "mean_token_accuracy": 0.4529437720775604, + "num_tokens": 8654039683.0, + "step": 16928 + }, + { + "epoch": 4.577879935100054, + "grad_norm": 4.53125, + "learning_rate": 0.0023347807922256213, + "loss": 2.9507, + "mean_token_accuracy": 0.420665979385376, + "num_tokens": 8654563961.0, + "step": 16929 + }, + { + "epoch": 4.578150351541374, + "grad_norm": 3.375, + "learning_rate": 0.0023343549460827636, + "loss": 2.8355, + "mean_token_accuracy": 0.447643518447876, + "num_tokens": 8655088141.0, + "step": 16930 + }, + { + "epoch": 4.578420767982694, + "grad_norm": 21.0, + "learning_rate": 0.002333929365827379, + "loss": 2.4971, + "mean_token_accuracy": 0.445720374584198, + "num_tokens": 8655612420.0, + "step": 16931 + }, + { + "epoch": 4.578691184424013, + "grad_norm": 9.5, + "learning_rate": 0.002333504051472528, + "loss": 2.9241, + "mean_token_accuracy": 0.43920278549194336, + "num_tokens": 8656136583.0, + "step": 16932 + }, + { + "epoch": 4.578961600865332, + "grad_norm": 2.703125, + "learning_rate": 0.0023330790030312606, + "loss": 2.7908, + "mean_token_accuracy": 0.43189358711242676, + "num_tokens": 8656660676.0, + "step": 16933 + }, + { + "epoch": 4.579232017306652, + "grad_norm": 4.46875, + "learning_rate": 0.0023326542205166145, + "loss": 2.548, + "mean_token_accuracy": 0.4778102934360504, + "num_tokens": 8657121391.0, + "step": 16934 + }, + { + "epoch": 4.5795024337479715, + "grad_norm": 3.609375, + "learning_rate": 0.0023322297039416267, + "loss": 2.7793, + "mean_token_accuracy": 0.43544989824295044, + "num_tokens": 8657645532.0, + "step": 16935 + }, + { + "epoch": 4.579772850189292, + "grad_norm": 3.875, + "learning_rate": 0.002331805453319322, + "loss": 2.7588, + "mean_token_accuracy": 0.42797404527664185, + "num_tokens": 8658169721.0, + "step": 16936 + }, + { + "epoch": 4.580043266630611, + "grad_norm": 4.28125, + "learning_rate": 0.002331381468662717, + "loss": 2.7931, + "mean_token_accuracy": 0.4383123219013214, + "num_tokens": 8658693923.0, + "step": 16937 + }, + { + "epoch": 4.580313683071931, + "grad_norm": 4.1875, + "learning_rate": 0.002330957749984822, + "loss": 2.8677, + "mean_token_accuracy": 0.42483657598495483, + "num_tokens": 8659166890.0, + "step": 16938 + }, + { + "epoch": 4.58058409951325, + "grad_norm": 3.46875, + "learning_rate": 0.002330534297298637, + "loss": 2.7385, + "mean_token_accuracy": 0.4521610140800476, + "num_tokens": 8659691161.0, + "step": 16939 + }, + { + "epoch": 4.58085451595457, + "grad_norm": 4.40625, + "learning_rate": 0.0023301111106171545, + "loss": 2.8337, + "mean_token_accuracy": 0.4448919892311096, + "num_tokens": 8660172270.0, + "step": 16940 + }, + { + "epoch": 4.581124932395889, + "grad_norm": 2.921875, + "learning_rate": 0.002329688189953361, + "loss": 2.6057, + "mean_token_accuracy": 0.454950749874115, + "num_tokens": 8660696508.0, + "step": 16941 + }, + { + "epoch": 4.5813953488372094, + "grad_norm": 3.765625, + "learning_rate": 0.002329265535320231, + "loss": 2.7029, + "mean_token_accuracy": 0.45049136877059937, + "num_tokens": 8661156050.0, + "step": 16942 + }, + { + "epoch": 4.581665765278529, + "grad_norm": 3.765625, + "learning_rate": 0.0023288431467307323, + "loss": 2.9066, + "mean_token_accuracy": 0.4195086658000946, + "num_tokens": 8661680193.0, + "step": 16943 + }, + { + "epoch": 4.581936181719849, + "grad_norm": 4.375, + "learning_rate": 0.002328421024197827, + "loss": 2.8312, + "mean_token_accuracy": 0.4278406500816345, + "num_tokens": 8662204317.0, + "step": 16944 + }, + { + "epoch": 4.582206598161168, + "grad_norm": 3.46875, + "learning_rate": 0.002327999167734466, + "loss": 2.7029, + "mean_token_accuracy": 0.4394451081752777, + "num_tokens": 8662722669.0, + "step": 16945 + }, + { + "epoch": 4.582477014602488, + "grad_norm": 3.640625, + "learning_rate": 0.0023275775773535933, + "loss": 2.8615, + "mean_token_accuracy": 0.4381197690963745, + "num_tokens": 8663202394.0, + "step": 16946 + }, + { + "epoch": 4.582747431043807, + "grad_norm": 4.125, + "learning_rate": 0.0023271562530681445, + "loss": 2.6617, + "mean_token_accuracy": 0.4493541419506073, + "num_tokens": 8663726636.0, + "step": 16947 + }, + { + "epoch": 4.583017847485127, + "grad_norm": 3.71875, + "learning_rate": 0.0023267351948910477, + "loss": 2.8465, + "mean_token_accuracy": 0.4406445026397705, + "num_tokens": 8664250877.0, + "step": 16948 + }, + { + "epoch": 4.5832882639264465, + "grad_norm": 3.453125, + "learning_rate": 0.00232631440283522, + "loss": 2.6056, + "mean_token_accuracy": 0.44475460052490234, + "num_tokens": 8664754348.0, + "step": 16949 + }, + { + "epoch": 4.583558680367767, + "grad_norm": 3.796875, + "learning_rate": 0.002325893876913574, + "loss": 2.6771, + "mean_token_accuracy": 0.45174920558929443, + "num_tokens": 8665278621.0, + "step": 16950 + }, + { + "epoch": 4.583829096809086, + "grad_norm": 40.75, + "learning_rate": 0.0023254736171390133, + "loss": 3.632, + "mean_token_accuracy": 0.4069860279560089, + "num_tokens": 8665802734.0, + "step": 16951 + }, + { + "epoch": 4.584099513250406, + "grad_norm": 8.3125, + "learning_rate": 0.002325053623524432, + "loss": 2.8998, + "mean_token_accuracy": 0.4367104172706604, + "num_tokens": 8666326936.0, + "step": 16952 + }, + { + "epoch": 4.584369929691725, + "grad_norm": 3.234375, + "learning_rate": 0.002324633896082718, + "loss": 2.8387, + "mean_token_accuracy": 0.4119122624397278, + "num_tokens": 8666851178.0, + "step": 16953 + }, + { + "epoch": 4.584640346133045, + "grad_norm": 3.25, + "learning_rate": 0.0023242144348267478, + "loss": 2.9546, + "mean_token_accuracy": 0.4382745623588562, + "num_tokens": 8667336476.0, + "step": 16954 + }, + { + "epoch": 4.584910762574364, + "grad_norm": 4.40625, + "learning_rate": 0.0023237952397693923, + "loss": 2.8152, + "mean_token_accuracy": 0.4437901973724365, + "num_tokens": 8667860738.0, + "step": 16955 + }, + { + "epoch": 4.5851811790156844, + "grad_norm": 3.5625, + "learning_rate": 0.002323376310923515, + "loss": 2.822, + "mean_token_accuracy": 0.4406074285507202, + "num_tokens": 8668384934.0, + "step": 16956 + }, + { + "epoch": 4.585451595457004, + "grad_norm": 3.90625, + "learning_rate": 0.0023229576483019673, + "loss": 2.764, + "mean_token_accuracy": 0.4440704584121704, + "num_tokens": 8668909195.0, + "step": 16957 + }, + { + "epoch": 4.585722011898324, + "grad_norm": 4.53125, + "learning_rate": 0.002322539251917597, + "loss": 2.5768, + "mean_token_accuracy": 0.46611320972442627, + "num_tokens": 8669387369.0, + "step": 16958 + }, + { + "epoch": 4.585992428339643, + "grad_norm": 4.0625, + "learning_rate": 0.0023221211217832427, + "loss": 2.5381, + "mean_token_accuracy": 0.45401376485824585, + "num_tokens": 8669911534.0, + "step": 16959 + }, + { + "epoch": 4.586262844780963, + "grad_norm": 3.984375, + "learning_rate": 0.0023217032579117298, + "loss": 2.9583, + "mean_token_accuracy": 0.43121498823165894, + "num_tokens": 8670435803.0, + "step": 16960 + }, + { + "epoch": 4.586533261222282, + "grad_norm": 3.4375, + "learning_rate": 0.0023212856603158838, + "loss": 2.807, + "mean_token_accuracy": 0.42050525546073914, + "num_tokens": 8670960057.0, + "step": 16961 + }, + { + "epoch": 4.586803677663602, + "grad_norm": 4.15625, + "learning_rate": 0.002320868329008516, + "loss": 2.668, + "mean_token_accuracy": 0.45261213183403015, + "num_tokens": 8671484232.0, + "step": 16962 + }, + { + "epoch": 4.5870740941049215, + "grad_norm": 4.34375, + "learning_rate": 0.002320451264002431, + "loss": 2.6935, + "mean_token_accuracy": 0.446824848651886, + "num_tokens": 8672008512.0, + "step": 16963 + }, + { + "epoch": 4.587344510546242, + "grad_norm": 3.28125, + "learning_rate": 0.002320034465310427, + "loss": 2.8937, + "mean_token_accuracy": 0.4438292384147644, + "num_tokens": 8672481351.0, + "step": 16964 + }, + { + "epoch": 4.587614926987561, + "grad_norm": 3.953125, + "learning_rate": 0.0023196179329452904, + "loss": 2.7002, + "mean_token_accuracy": 0.4403495788574219, + "num_tokens": 8673005585.0, + "step": 16965 + }, + { + "epoch": 4.58788534342888, + "grad_norm": 3.796875, + "learning_rate": 0.002319201666919803, + "loss": 2.6486, + "mean_token_accuracy": 0.4726172685623169, + "num_tokens": 8673468188.0, + "step": 16966 + }, + { + "epoch": 4.5881557598702, + "grad_norm": 4.09375, + "learning_rate": 0.002318785667246738, + "loss": 2.8449, + "mean_token_accuracy": 0.4372865557670593, + "num_tokens": 8673992402.0, + "step": 16967 + }, + { + "epoch": 4.58842617631152, + "grad_norm": 4.4375, + "learning_rate": 0.002318369933938859, + "loss": 2.7691, + "mean_token_accuracy": 0.45116907358169556, + "num_tokens": 8674516577.0, + "step": 16968 + }, + { + "epoch": 4.588696592752839, + "grad_norm": 3.5625, + "learning_rate": 0.0023179544670089195, + "loss": 2.6287, + "mean_token_accuracy": 0.4665040969848633, + "num_tokens": 8675040856.0, + "step": 16969 + }, + { + "epoch": 4.588967009194159, + "grad_norm": 4.25, + "learning_rate": 0.0023175392664696705, + "loss": 2.9359, + "mean_token_accuracy": 0.4301278591156006, + "num_tokens": 8675565035.0, + "step": 16970 + }, + { + "epoch": 4.589237425635479, + "grad_norm": 53.75, + "learning_rate": 0.002317124332333851, + "loss": 4.9138, + "mean_token_accuracy": 0.27522018551826477, + "num_tokens": 8676088963.0, + "step": 16971 + }, + { + "epoch": 4.589507842076799, + "grad_norm": 8.8125, + "learning_rate": 0.0023167096646141906, + "loss": 2.9301, + "mean_token_accuracy": 0.4482795000076294, + "num_tokens": 8676560782.0, + "step": 16972 + }, + { + "epoch": 4.589778258518118, + "grad_norm": 3.28125, + "learning_rate": 0.002316295263323414, + "loss": 2.7631, + "mean_token_accuracy": 0.45229798555374146, + "num_tokens": 8677023647.0, + "step": 16973 + }, + { + "epoch": 4.590048674959437, + "grad_norm": 3.75, + "learning_rate": 0.0023158811284742353, + "loss": 2.7802, + "mean_token_accuracy": 0.4351060390472412, + "num_tokens": 8677547645.0, + "step": 16974 + }, + { + "epoch": 4.590319091400757, + "grad_norm": 3.09375, + "learning_rate": 0.002315467260079362, + "loss": 2.7411, + "mean_token_accuracy": 0.4559425413608551, + "num_tokens": 8678071918.0, + "step": 16975 + }, + { + "epoch": 4.590589507842076, + "grad_norm": 4.5, + "learning_rate": 0.0023150536581514935, + "loss": 2.954, + "mean_token_accuracy": 0.4350205659866333, + "num_tokens": 8678535988.0, + "step": 16976 + }, + { + "epoch": 4.5908599242833965, + "grad_norm": 3.921875, + "learning_rate": 0.002314640322703319, + "loss": 2.5266, + "mean_token_accuracy": 0.45259472727775574, + "num_tokens": 8679060188.0, + "step": 16977 + }, + { + "epoch": 4.591130340724716, + "grad_norm": 3.140625, + "learning_rate": 0.002314227253747521, + "loss": 2.9042, + "mean_token_accuracy": 0.4391072392463684, + "num_tokens": 8679584377.0, + "step": 16978 + }, + { + "epoch": 4.591400757166036, + "grad_norm": 3.40625, + "learning_rate": 0.002313814451296774, + "loss": 2.5204, + "mean_token_accuracy": 0.455818235874176, + "num_tokens": 8680108555.0, + "step": 16979 + }, + { + "epoch": 4.591671173607355, + "grad_norm": 3.578125, + "learning_rate": 0.0023134019153637445, + "loss": 2.8293, + "mean_token_accuracy": 0.4294392168521881, + "num_tokens": 8680632824.0, + "step": 16980 + }, + { + "epoch": 4.591941590048675, + "grad_norm": 3.59375, + "learning_rate": 0.002312989645961089, + "loss": 2.7319, + "mean_token_accuracy": 0.4599231779575348, + "num_tokens": 8681157012.0, + "step": 16981 + }, + { + "epoch": 4.592212006489994, + "grad_norm": 3.21875, + "learning_rate": 0.0023125776431014594, + "loss": 2.7341, + "mean_token_accuracy": 0.4562760889530182, + "num_tokens": 8681615901.0, + "step": 16982 + }, + { + "epoch": 4.592482422931314, + "grad_norm": 4.40625, + "learning_rate": 0.002312165906797494, + "loss": 2.7822, + "mean_token_accuracy": 0.4557214379310608, + "num_tokens": 8682140168.0, + "step": 16983 + }, + { + "epoch": 4.592752839372634, + "grad_norm": 3.671875, + "learning_rate": 0.00231175443706183, + "loss": 2.6451, + "mean_token_accuracy": 0.4481160342693329, + "num_tokens": 8682664366.0, + "step": 16984 + }, + { + "epoch": 4.593023255813954, + "grad_norm": 3.96875, + "learning_rate": 0.0023113432339070894, + "loss": 2.8711, + "mean_token_accuracy": 0.45019271969795227, + "num_tokens": 8683029313.0, + "step": 16985 + }, + { + "epoch": 4.593293672255273, + "grad_norm": 3.84375, + "learning_rate": 0.002310932297345889, + "loss": 2.8141, + "mean_token_accuracy": 0.4501339793205261, + "num_tokens": 8683553372.0, + "step": 16986 + }, + { + "epoch": 4.593564088696593, + "grad_norm": 4.0625, + "learning_rate": 0.00231052162739084, + "loss": 2.8349, + "mean_token_accuracy": 0.4365132749080658, + "num_tokens": 8684077603.0, + "step": 16987 + }, + { + "epoch": 4.593834505137912, + "grad_norm": 3.921875, + "learning_rate": 0.002310111224054542, + "loss": 2.7436, + "mean_token_accuracy": 0.45329707860946655, + "num_tokens": 8684601868.0, + "step": 16988 + }, + { + "epoch": 4.594104921579232, + "grad_norm": 3.6875, + "learning_rate": 0.002309701087349585, + "loss": 2.6073, + "mean_token_accuracy": 0.4651584327220917, + "num_tokens": 8685125893.0, + "step": 16989 + }, + { + "epoch": 4.594375338020551, + "grad_norm": 4.03125, + "learning_rate": 0.0023092912172885567, + "loss": 2.8915, + "mean_token_accuracy": 0.4181802272796631, + "num_tokens": 8685650072.0, + "step": 16990 + }, + { + "epoch": 4.5946457544618715, + "grad_norm": 125.5, + "learning_rate": 0.0023088816138840318, + "loss": 2.9927, + "mean_token_accuracy": 0.4449864327907562, + "num_tokens": 8686174121.0, + "step": 16991 + }, + { + "epoch": 4.594916170903191, + "grad_norm": 7.3125, + "learning_rate": 0.0023084722771485764, + "loss": 2.8251, + "mean_token_accuracy": 0.4272097945213318, + "num_tokens": 8686698258.0, + "step": 16992 + }, + { + "epoch": 4.595186587344511, + "grad_norm": 2.46875, + "learning_rate": 0.0023080632070947515, + "loss": 2.8179, + "mean_token_accuracy": 0.4592517018318176, + "num_tokens": 8687179042.0, + "step": 16993 + }, + { + "epoch": 4.59545700378583, + "grad_norm": 4.875, + "learning_rate": 0.00230765440373511, + "loss": 2.9135, + "mean_token_accuracy": 0.4367861747741699, + "num_tokens": 8687703320.0, + "step": 16994 + }, + { + "epoch": 4.59572742022715, + "grad_norm": 3.328125, + "learning_rate": 0.002307245867082193, + "loss": 2.7725, + "mean_token_accuracy": 0.43269312381744385, + "num_tokens": 8688227592.0, + "step": 16995 + }, + { + "epoch": 4.595997836668469, + "grad_norm": 3.9375, + "learning_rate": 0.002306837597148537, + "loss": 2.7496, + "mean_token_accuracy": 0.4427676796913147, + "num_tokens": 8688751833.0, + "step": 16996 + }, + { + "epoch": 4.596268253109789, + "grad_norm": 4.9375, + "learning_rate": 0.0023064295939466692, + "loss": 2.779, + "mean_token_accuracy": 0.43121880292892456, + "num_tokens": 8689276090.0, + "step": 16997 + }, + { + "epoch": 4.596538669551109, + "grad_norm": 3.6875, + "learning_rate": 0.0023060218574891063, + "loss": 2.9259, + "mean_token_accuracy": 0.42391711473464966, + "num_tokens": 8689800238.0, + "step": 16998 + }, + { + "epoch": 4.596809085992429, + "grad_norm": 4.9375, + "learning_rate": 0.0023056143877883614, + "loss": 2.7839, + "mean_token_accuracy": 0.4865647256374359, + "num_tokens": 8690259862.0, + "step": 16999 + }, + { + "epoch": 4.597079502433748, + "grad_norm": 5.40625, + "learning_rate": 0.0023052071848569354, + "loss": 2.8529, + "mean_token_accuracy": 0.4544437527656555, + "num_tokens": 8690718882.0, + "step": 17000 + }, + { + "epoch": 4.597349918875068, + "grad_norm": 3.109375, + "learning_rate": 0.0023048002487073213, + "loss": 2.6881, + "mean_token_accuracy": 0.4381375312805176, + "num_tokens": 8691243103.0, + "step": 17001 + }, + { + "epoch": 4.597620335316387, + "grad_norm": 3.578125, + "learning_rate": 0.002304393579352008, + "loss": 2.8285, + "mean_token_accuracy": 0.45270538330078125, + "num_tokens": 8691760980.0, + "step": 17002 + }, + { + "epoch": 4.597890751757707, + "grad_norm": 4.21875, + "learning_rate": 0.0023039871768034707, + "loss": 2.7593, + "mean_token_accuracy": 0.45430004596710205, + "num_tokens": 8692285226.0, + "step": 17003 + }, + { + "epoch": 4.598161168199026, + "grad_norm": 4.0625, + "learning_rate": 0.0023035810410741812, + "loss": 2.7574, + "mean_token_accuracy": 0.44074323773384094, + "num_tokens": 8692809422.0, + "step": 17004 + }, + { + "epoch": 4.5984315846403465, + "grad_norm": 5.03125, + "learning_rate": 0.0023031751721766, + "loss": 2.9229, + "mean_token_accuracy": 0.4560001790523529, + "num_tokens": 8693171951.0, + "step": 17005 + }, + { + "epoch": 4.598702001081666, + "grad_norm": 4.1875, + "learning_rate": 0.0023027695701231784, + "loss": 2.8979, + "mean_token_accuracy": 0.4257171154022217, + "num_tokens": 8693696228.0, + "step": 17006 + }, + { + "epoch": 4.598972417522985, + "grad_norm": 3.5625, + "learning_rate": 0.002302364234926365, + "loss": 2.4787, + "mean_token_accuracy": 0.4349231421947479, + "num_tokens": 8694194780.0, + "step": 17007 + }, + { + "epoch": 4.599242833964305, + "grad_norm": 5.21875, + "learning_rate": 0.002301959166598593, + "loss": 2.5737, + "mean_token_accuracy": 0.4720726013183594, + "num_tokens": 8694718927.0, + "step": 17008 + }, + { + "epoch": 4.599513250405625, + "grad_norm": 3.46875, + "learning_rate": 0.0023015543651522937, + "loss": 2.7558, + "mean_token_accuracy": 0.4468969404697418, + "num_tokens": 8695243064.0, + "step": 17009 + }, + { + "epoch": 4.599783666846944, + "grad_norm": 4.3125, + "learning_rate": 0.002301149830599889, + "loss": 2.8826, + "mean_token_accuracy": 0.4327753186225891, + "num_tokens": 8695767339.0, + "step": 17010 + }, + { + "epoch": 4.6000540832882635, + "grad_norm": 9.625, + "learning_rate": 0.0023007455629537876, + "loss": 2.2341, + "mean_token_accuracy": 0.5406564474105835, + "num_tokens": 8696291576.0, + "step": 17011 + }, + { + "epoch": 4.600324499729584, + "grad_norm": 7.28125, + "learning_rate": 0.0023003415622263944, + "loss": 2.881, + "mean_token_accuracy": 0.46102046966552734, + "num_tokens": 8696716204.0, + "step": 17012 + }, + { + "epoch": 4.600594916170904, + "grad_norm": 2.875, + "learning_rate": 0.0022999378284301077, + "loss": 2.6184, + "mean_token_accuracy": 0.4525447189807892, + "num_tokens": 8697240364.0, + "step": 17013 + }, + { + "epoch": 4.600865332612223, + "grad_norm": 3.15625, + "learning_rate": 0.0022995343615773134, + "loss": 2.8742, + "mean_token_accuracy": 0.4334961175918579, + "num_tokens": 8697764525.0, + "step": 17014 + }, + { + "epoch": 4.601135749053542, + "grad_norm": 3.921875, + "learning_rate": 0.0022991311616803903, + "loss": 2.9868, + "mean_token_accuracy": 0.4110133945941925, + "num_tokens": 8698288756.0, + "step": 17015 + }, + { + "epoch": 4.601406165494862, + "grad_norm": 4.21875, + "learning_rate": 0.0022987282287517115, + "loss": 2.8223, + "mean_token_accuracy": 0.45016980171203613, + "num_tokens": 8698789219.0, + "step": 17016 + }, + { + "epoch": 4.601676581936181, + "grad_norm": 3.234375, + "learning_rate": 0.00229832556280364, + "loss": 2.8708, + "mean_token_accuracy": 0.4425536096096039, + "num_tokens": 8699313340.0, + "step": 17017 + }, + { + "epoch": 4.601946998377501, + "grad_norm": 3.9375, + "learning_rate": 0.002297923163848529, + "loss": 2.7269, + "mean_token_accuracy": 0.4583374857902527, + "num_tokens": 8699837419.0, + "step": 17018 + }, + { + "epoch": 4.602217414818821, + "grad_norm": 3.53125, + "learning_rate": 0.002297521031898727, + "loss": 2.6894, + "mean_token_accuracy": 0.4578956961631775, + "num_tokens": 8700361629.0, + "step": 17019 + }, + { + "epoch": 4.602487831260141, + "grad_norm": 3.375, + "learning_rate": 0.002297119166966572, + "loss": 2.7561, + "mean_token_accuracy": 0.44997042417526245, + "num_tokens": 8700865757.0, + "step": 17020 + }, + { + "epoch": 4.60275824770146, + "grad_norm": 3.75, + "learning_rate": 0.0022967175690643948, + "loss": 2.7149, + "mean_token_accuracy": 0.43930065631866455, + "num_tokens": 8701389963.0, + "step": 17021 + }, + { + "epoch": 4.60302866414278, + "grad_norm": 3.84375, + "learning_rate": 0.0022963162382045156, + "loss": 2.8947, + "mean_token_accuracy": 0.4465336799621582, + "num_tokens": 8701914195.0, + "step": 17022 + }, + { + "epoch": 4.603299080584099, + "grad_norm": 3.8125, + "learning_rate": 0.0022959151743992524, + "loss": 2.8213, + "mean_token_accuracy": 0.444367378950119, + "num_tokens": 8702378925.0, + "step": 17023 + }, + { + "epoch": 4.603569497025419, + "grad_norm": 3.25, + "learning_rate": 0.0022955143776609063, + "loss": 2.9686, + "mean_token_accuracy": 0.43338215351104736, + "num_tokens": 8702881485.0, + "step": 17024 + }, + { + "epoch": 4.6038399134667385, + "grad_norm": 4.34375, + "learning_rate": 0.002295113848001779, + "loss": 2.8179, + "mean_token_accuracy": 0.41084253787994385, + "num_tokens": 8703405708.0, + "step": 17025 + }, + { + "epoch": 4.604110329908059, + "grad_norm": 3.765625, + "learning_rate": 0.002294713585434158, + "loss": 2.7658, + "mean_token_accuracy": 0.4421747326850891, + "num_tokens": 8703929957.0, + "step": 17026 + }, + { + "epoch": 4.604380746349378, + "grad_norm": 3.671875, + "learning_rate": 0.002294313589970325, + "loss": 2.8239, + "mean_token_accuracy": 0.44666895270347595, + "num_tokens": 8704454235.0, + "step": 17027 + }, + { + "epoch": 4.604651162790698, + "grad_norm": 4.53125, + "learning_rate": 0.002293913861622553, + "loss": 2.9763, + "mean_token_accuracy": 0.4111385643482208, + "num_tokens": 8704978487.0, + "step": 17028 + }, + { + "epoch": 4.604921579232017, + "grad_norm": 3.125, + "learning_rate": 0.0022935144004031056, + "loss": 2.8408, + "mean_token_accuracy": 0.4373057186603546, + "num_tokens": 8705502689.0, + "step": 17029 + }, + { + "epoch": 4.605191995673337, + "grad_norm": 5.21875, + "learning_rate": 0.0022931152063242416, + "loss": 2.736, + "mean_token_accuracy": 0.46557992696762085, + "num_tokens": 8706026945.0, + "step": 17030 + }, + { + "epoch": 4.605462412114656, + "grad_norm": 26.625, + "learning_rate": 0.0022927162793982075, + "loss": 3.3475, + "mean_token_accuracy": 0.3930971920490265, + "num_tokens": 8706551220.0, + "step": 17031 + }, + { + "epoch": 4.605732828555976, + "grad_norm": 9.4375, + "learning_rate": 0.0022923176196372446, + "loss": 2.9174, + "mean_token_accuracy": 0.4432794153690338, + "num_tokens": 8707075453.0, + "step": 17032 + }, + { + "epoch": 4.606003244997296, + "grad_norm": 3.96875, + "learning_rate": 0.002291919227053585, + "loss": 2.6961, + "mean_token_accuracy": 0.4552730917930603, + "num_tokens": 8707587246.0, + "step": 17033 + }, + { + "epoch": 4.606273661438616, + "grad_norm": 3.828125, + "learning_rate": 0.002291521101659453, + "loss": 2.7873, + "mean_token_accuracy": 0.4594578742980957, + "num_tokens": 8708111462.0, + "step": 17034 + }, + { + "epoch": 4.606544077879935, + "grad_norm": 3.46875, + "learning_rate": 0.0022911232434670616, + "loss": 2.8018, + "mean_token_accuracy": 0.43460777401924133, + "num_tokens": 8708635737.0, + "step": 17035 + }, + { + "epoch": 4.606814494321255, + "grad_norm": 3.453125, + "learning_rate": 0.0022907256524886213, + "loss": 2.7878, + "mean_token_accuracy": 0.43991437554359436, + "num_tokens": 8709159995.0, + "step": 17036 + }, + { + "epoch": 4.607084910762574, + "grad_norm": 3.5, + "learning_rate": 0.00229032832873633, + "loss": 2.7326, + "mean_token_accuracy": 0.4493612051010132, + "num_tokens": 8709684274.0, + "step": 17037 + }, + { + "epoch": 4.607355327203894, + "grad_norm": 3.40625, + "learning_rate": 0.0022899312722223794, + "loss": 2.8371, + "mean_token_accuracy": 0.4555911123752594, + "num_tokens": 8710154154.0, + "step": 17038 + }, + { + "epoch": 4.6076257436452135, + "grad_norm": 3.671875, + "learning_rate": 0.0022895344829589525, + "loss": 2.7207, + "mean_token_accuracy": 0.46785157918930054, + "num_tokens": 8710678356.0, + "step": 17039 + }, + { + "epoch": 4.607896160086534, + "grad_norm": 4.5, + "learning_rate": 0.0022891379609582237, + "loss": 2.8414, + "mean_token_accuracy": 0.45624423027038574, + "num_tokens": 8711202521.0, + "step": 17040 + }, + { + "epoch": 4.608166576527853, + "grad_norm": 4.8125, + "learning_rate": 0.0022887417062323577, + "loss": 3.0021, + "mean_token_accuracy": 0.4146861732006073, + "num_tokens": 8711726760.0, + "step": 17041 + }, + { + "epoch": 4.608436992969173, + "grad_norm": 4.1875, + "learning_rate": 0.0022883457187935154, + "loss": 2.931, + "mean_token_accuracy": 0.43873369693756104, + "num_tokens": 8712250967.0, + "step": 17042 + }, + { + "epoch": 4.608707409410492, + "grad_norm": 4.59375, + "learning_rate": 0.002287949998653846, + "loss": 2.8471, + "mean_token_accuracy": 0.4320746958255768, + "num_tokens": 8712775207.0, + "step": 17043 + }, + { + "epoch": 4.608977825851812, + "grad_norm": 3.96875, + "learning_rate": 0.0022875545458254896, + "loss": 2.7806, + "mean_token_accuracy": 0.44192662835121155, + "num_tokens": 8713299336.0, + "step": 17044 + }, + { + "epoch": 4.609248242293131, + "grad_norm": 4.0, + "learning_rate": 0.0022871593603205832, + "loss": 2.7797, + "mean_token_accuracy": 0.43714457750320435, + "num_tokens": 8713823512.0, + "step": 17045 + }, + { + "epoch": 4.609518658734451, + "grad_norm": 4.125, + "learning_rate": 0.0022867644421512502, + "loss": 2.6083, + "mean_token_accuracy": 0.44868364930152893, + "num_tokens": 8714347697.0, + "step": 17046 + }, + { + "epoch": 4.609789075175771, + "grad_norm": 3.6875, + "learning_rate": 0.0022863697913296068, + "loss": 2.6268, + "mean_token_accuracy": 0.4405662417411804, + "num_tokens": 8714854381.0, + "step": 17047 + }, + { + "epoch": 4.61005949161709, + "grad_norm": 3.578125, + "learning_rate": 0.0022859754078677634, + "loss": 2.5612, + "mean_token_accuracy": 0.4697383940219879, + "num_tokens": 8715378536.0, + "step": 17048 + }, + { + "epoch": 4.61032990805841, + "grad_norm": 3.703125, + "learning_rate": 0.002285581291777821, + "loss": 2.6486, + "mean_token_accuracy": 0.43740683794021606, + "num_tokens": 8715902792.0, + "step": 17049 + }, + { + "epoch": 4.61060032449973, + "grad_norm": 3.625, + "learning_rate": 0.0022851874430718727, + "loss": 2.8194, + "mean_token_accuracy": 0.4204252362251282, + "num_tokens": 8716426367.0, + "step": 17050 + }, + { + "epoch": 4.610870740941049, + "grad_norm": 34.25, + "learning_rate": 0.0022847938617620014, + "loss": 2.7603, + "mean_token_accuracy": 0.4578864574432373, + "num_tokens": 8716950584.0, + "step": 17051 + }, + { + "epoch": 4.611141157382368, + "grad_norm": 7.4375, + "learning_rate": 0.002284400547860284, + "loss": 2.7775, + "mean_token_accuracy": 0.4566216468811035, + "num_tokens": 8717413704.0, + "step": 17052 + }, + { + "epoch": 4.6114115738236885, + "grad_norm": 2.4375, + "learning_rate": 0.0022840075013787903, + "loss": 2.9089, + "mean_token_accuracy": 0.43186303973197937, + "num_tokens": 8717933567.0, + "step": 17053 + }, + { + "epoch": 4.611681990265009, + "grad_norm": 4.25, + "learning_rate": 0.002283614722329578, + "loss": 2.498, + "mean_token_accuracy": 0.46952545642852783, + "num_tokens": 8718398140.0, + "step": 17054 + }, + { + "epoch": 4.611952406706328, + "grad_norm": 3.46875, + "learning_rate": 0.0022832222107246982, + "loss": 2.7398, + "mean_token_accuracy": 0.4581570625305176, + "num_tokens": 8718839941.0, + "step": 17055 + }, + { + "epoch": 4.612222823147647, + "grad_norm": 3.234375, + "learning_rate": 0.002282829966576196, + "loss": 2.8595, + "mean_token_accuracy": 0.4312295913696289, + "num_tokens": 8719364204.0, + "step": 17056 + }, + { + "epoch": 4.612493239588967, + "grad_norm": 3.890625, + "learning_rate": 0.002282437989896107, + "loss": 2.7701, + "mean_token_accuracy": 0.4369082748889923, + "num_tokens": 8719888381.0, + "step": 17057 + }, + { + "epoch": 4.612763656030286, + "grad_norm": 4.1875, + "learning_rate": 0.0022820462806964555, + "loss": 2.5528, + "mean_token_accuracy": 0.4679972529411316, + "num_tokens": 8720387992.0, + "step": 17058 + }, + { + "epoch": 4.613034072471606, + "grad_norm": 4.21875, + "learning_rate": 0.0022816548389892623, + "loss": 2.7337, + "mean_token_accuracy": 0.44184499979019165, + "num_tokens": 8720860865.0, + "step": 17059 + }, + { + "epoch": 4.6133044889129255, + "grad_norm": 3.96875, + "learning_rate": 0.0022812636647865383, + "loss": 2.8528, + "mean_token_accuracy": 0.42204397916793823, + "num_tokens": 8721385110.0, + "step": 17060 + }, + { + "epoch": 4.613574905354246, + "grad_norm": 4.15625, + "learning_rate": 0.0022808727581002837, + "loss": 2.8121, + "mean_token_accuracy": 0.44026899337768555, + "num_tokens": 8721909281.0, + "step": 17061 + }, + { + "epoch": 4.613845321795565, + "grad_norm": 4.15625, + "learning_rate": 0.0022804821189424955, + "loss": 2.6676, + "mean_token_accuracy": 0.4584483504295349, + "num_tokens": 8722433542.0, + "step": 17062 + }, + { + "epoch": 4.614115738236885, + "grad_norm": 3.421875, + "learning_rate": 0.002280091747325158, + "loss": 3.0162, + "mean_token_accuracy": 0.4181806147098541, + "num_tokens": 8722957822.0, + "step": 17063 + }, + { + "epoch": 4.614386154678204, + "grad_norm": 3.6875, + "learning_rate": 0.00227970164326025, + "loss": 2.6208, + "mean_token_accuracy": 0.4659065008163452, + "num_tokens": 8723482024.0, + "step": 17064 + }, + { + "epoch": 4.614656571119524, + "grad_norm": 4.53125, + "learning_rate": 0.002279311806759738, + "loss": 2.5343, + "mean_token_accuracy": 0.4675291180610657, + "num_tokens": 8723961692.0, + "step": 17065 + }, + { + "epoch": 4.614926987560843, + "grad_norm": 3.890625, + "learning_rate": 0.0022789222378355885, + "loss": 3.0072, + "mean_token_accuracy": 0.41620969772338867, + "num_tokens": 8724485914.0, + "step": 17066 + }, + { + "epoch": 4.6151974040021635, + "grad_norm": 4.09375, + "learning_rate": 0.00227853293649975, + "loss": 2.7772, + "mean_token_accuracy": 0.4543894827365875, + "num_tokens": 8724931264.0, + "step": 17067 + }, + { + "epoch": 4.615467820443483, + "grad_norm": 3.40625, + "learning_rate": 0.0022781439027641697, + "loss": 2.6781, + "mean_token_accuracy": 0.46751639246940613, + "num_tokens": 8725415935.0, + "step": 17068 + }, + { + "epoch": 4.615738236884803, + "grad_norm": 3.859375, + "learning_rate": 0.002277755136640784, + "loss": 2.827, + "mean_token_accuracy": 0.43784815073013306, + "num_tokens": 8725940215.0, + "step": 17069 + }, + { + "epoch": 4.616008653326122, + "grad_norm": 4.03125, + "learning_rate": 0.00227736663814152, + "loss": 2.7644, + "mean_token_accuracy": 0.45294246077537537, + "num_tokens": 8726381506.0, + "step": 17070 + }, + { + "epoch": 4.616279069767442, + "grad_norm": 9.4375, + "learning_rate": 0.0022769784072783007, + "loss": 2.397, + "mean_token_accuracy": 0.48839396238327026, + "num_tokens": 8726902495.0, + "step": 17071 + }, + { + "epoch": 4.616549486208761, + "grad_norm": 9.5625, + "learning_rate": 0.0022765904440630357, + "loss": 2.6828, + "mean_token_accuracy": 0.4602978229522705, + "num_tokens": 8727426720.0, + "step": 17072 + }, + { + "epoch": 4.616819902650081, + "grad_norm": 6.53125, + "learning_rate": 0.0022762027485076306, + "loss": 2.2288, + "mean_token_accuracy": 0.5441524982452393, + "num_tokens": 8727951001.0, + "step": 17073 + }, + { + "epoch": 4.6170903190914006, + "grad_norm": 3.859375, + "learning_rate": 0.0022758153206239797, + "loss": 2.8332, + "mean_token_accuracy": 0.44649675488471985, + "num_tokens": 8728379767.0, + "step": 17074 + }, + { + "epoch": 4.617360735532721, + "grad_norm": 3.390625, + "learning_rate": 0.0022754281604239705, + "loss": 2.6369, + "mean_token_accuracy": 0.4505082666873932, + "num_tokens": 8728839655.0, + "step": 17075 + }, + { + "epoch": 4.61763115197404, + "grad_norm": 3.515625, + "learning_rate": 0.002275041267919483, + "loss": 2.9778, + "mean_token_accuracy": 0.42852428555488586, + "num_tokens": 8729363806.0, + "step": 17076 + }, + { + "epoch": 4.61790156841536, + "grad_norm": 3.734375, + "learning_rate": 0.0022746546431223888, + "loss": 2.7538, + "mean_token_accuracy": 0.4645479917526245, + "num_tokens": 8729822605.0, + "step": 17077 + }, + { + "epoch": 4.618171984856679, + "grad_norm": 3.21875, + "learning_rate": 0.002274268286044548, + "loss": 2.8351, + "mean_token_accuracy": 0.43008238077163696, + "num_tokens": 8730346856.0, + "step": 17078 + }, + { + "epoch": 4.618442401297999, + "grad_norm": 4.25, + "learning_rate": 0.0022738821966978165, + "loss": 2.5943, + "mean_token_accuracy": 0.4615136981010437, + "num_tokens": 8730818998.0, + "step": 17079 + }, + { + "epoch": 4.618712817739318, + "grad_norm": 3.140625, + "learning_rate": 0.002273496375094043, + "loss": 2.9792, + "mean_token_accuracy": 0.42042654752731323, + "num_tokens": 8731343202.0, + "step": 17080 + }, + { + "epoch": 4.6189832341806385, + "grad_norm": 4.15625, + "learning_rate": 0.002273110821245062, + "loss": 2.6527, + "mean_token_accuracy": 0.46595558524131775, + "num_tokens": 8731817968.0, + "step": 17081 + }, + { + "epoch": 4.619253650621958, + "grad_norm": 3.359375, + "learning_rate": 0.0022727255351627064, + "loss": 2.5862, + "mean_token_accuracy": 0.46768707036972046, + "num_tokens": 8732342171.0, + "step": 17082 + }, + { + "epoch": 4.619524067063278, + "grad_norm": 3.703125, + "learning_rate": 0.0022723405168587965, + "loss": 2.7739, + "mean_token_accuracy": 0.4433317184448242, + "num_tokens": 8732866405.0, + "step": 17083 + }, + { + "epoch": 4.619794483504597, + "grad_norm": 3.734375, + "learning_rate": 0.0022719557663451444, + "loss": 2.6467, + "mean_token_accuracy": 0.43520283699035645, + "num_tokens": 8733390241.0, + "step": 17084 + }, + { + "epoch": 4.620064899945917, + "grad_norm": 4.1875, + "learning_rate": 0.0022715712836335594, + "loss": 2.6707, + "mean_token_accuracy": 0.42037147283554077, + "num_tokens": 8733914439.0, + "step": 17085 + }, + { + "epoch": 4.620335316387236, + "grad_norm": 3.1875, + "learning_rate": 0.0022711870687358348, + "loss": 2.864, + "mean_token_accuracy": 0.4276658296585083, + "num_tokens": 8734438619.0, + "step": 17086 + }, + { + "epoch": 4.620605732828556, + "grad_norm": 3.71875, + "learning_rate": 0.0022708031216637594, + "loss": 2.6457, + "mean_token_accuracy": 0.46465402841567993, + "num_tokens": 8734914267.0, + "step": 17087 + }, + { + "epoch": 4.6208761492698756, + "grad_norm": 3.921875, + "learning_rate": 0.0022704194424291164, + "loss": 2.8379, + "mean_token_accuracy": 0.4506823420524597, + "num_tokens": 8735409399.0, + "step": 17088 + }, + { + "epoch": 4.621146565711195, + "grad_norm": 3.9375, + "learning_rate": 0.002270036031043678, + "loss": 2.8815, + "mean_token_accuracy": 0.4445139169692993, + "num_tokens": 8735933638.0, + "step": 17089 + }, + { + "epoch": 4.621416982152515, + "grad_norm": 3.84375, + "learning_rate": 0.002269652887519205, + "loss": 2.7969, + "mean_token_accuracy": 0.45235055685043335, + "num_tokens": 8736429980.0, + "step": 17090 + }, + { + "epoch": 4.621687398593835, + "grad_norm": 28.75, + "learning_rate": 0.0022692700118674574, + "loss": 3.1487, + "mean_token_accuracy": 0.42677468061447144, + "num_tokens": 8736954062.0, + "step": 17091 + }, + { + "epoch": 4.621957815035154, + "grad_norm": 7.09375, + "learning_rate": 0.00226888740410018, + "loss": 2.8497, + "mean_token_accuracy": 0.4201065003871918, + "num_tokens": 8737478240.0, + "step": 17092 + }, + { + "epoch": 4.622228231476473, + "grad_norm": 2.90625, + "learning_rate": 0.0022685050642291134, + "loss": 2.8281, + "mean_token_accuracy": 0.4309801161289215, + "num_tokens": 8738002389.0, + "step": 17093 + }, + { + "epoch": 4.622498647917793, + "grad_norm": 4.4375, + "learning_rate": 0.002268122992265991, + "loss": 2.8238, + "mean_token_accuracy": 0.4560232162475586, + "num_tokens": 8738526629.0, + "step": 17094 + }, + { + "epoch": 4.6227690643591135, + "grad_norm": 3.359375, + "learning_rate": 0.002267741188222532, + "loss": 2.7746, + "mean_token_accuracy": 0.4402761459350586, + "num_tokens": 8739050809.0, + "step": 17095 + }, + { + "epoch": 4.623039480800433, + "grad_norm": 3.34375, + "learning_rate": 0.002267359652110455, + "loss": 2.7704, + "mean_token_accuracy": 0.45235663652420044, + "num_tokens": 8739574940.0, + "step": 17096 + }, + { + "epoch": 4.623309897241752, + "grad_norm": 3.34375, + "learning_rate": 0.0022669783839414643, + "loss": 2.76, + "mean_token_accuracy": 0.42287328839302063, + "num_tokens": 8740099209.0, + "step": 17097 + }, + { + "epoch": 4.623580313683072, + "grad_norm": 4.03125, + "learning_rate": 0.0022665973837272587, + "loss": 2.8657, + "mean_token_accuracy": 0.4379306733608246, + "num_tokens": 8740623482.0, + "step": 17098 + }, + { + "epoch": 4.623850730124391, + "grad_norm": 3.765625, + "learning_rate": 0.002266216651479529, + "loss": 2.6179, + "mean_token_accuracy": 0.4549049735069275, + "num_tokens": 8741147652.0, + "step": 17099 + }, + { + "epoch": 4.624121146565711, + "grad_norm": 3.71875, + "learning_rate": 0.0022658361872099565, + "loss": 2.7466, + "mean_token_accuracy": 0.4349749684333801, + "num_tokens": 8741671821.0, + "step": 17100 + }, + { + "epoch": 4.6243915630070305, + "grad_norm": 4.125, + "learning_rate": 0.0022654559909302146, + "loss": 2.6964, + "mean_token_accuracy": 0.4522005021572113, + "num_tokens": 8742159252.0, + "step": 17101 + }, + { + "epoch": 4.624661979448351, + "grad_norm": 4.125, + "learning_rate": 0.002265076062651971, + "loss": 2.7983, + "mean_token_accuracy": 0.4417869448661804, + "num_tokens": 8742683467.0, + "step": 17102 + }, + { + "epoch": 4.62493239588967, + "grad_norm": 3.765625, + "learning_rate": 0.0022646964023868804, + "loss": 2.5948, + "mean_token_accuracy": 0.4654909670352936, + "num_tokens": 8743188180.0, + "step": 17103 + }, + { + "epoch": 4.62520281233099, + "grad_norm": 3.546875, + "learning_rate": 0.0022643170101465924, + "loss": 2.7243, + "mean_token_accuracy": 0.4625956416130066, + "num_tokens": 8743680307.0, + "step": 17104 + }, + { + "epoch": 4.625473228772309, + "grad_norm": 3.859375, + "learning_rate": 0.0022639378859427503, + "loss": 2.9109, + "mean_token_accuracy": 0.44070786237716675, + "num_tokens": 8744204570.0, + "step": 17105 + }, + { + "epoch": 4.625743645213629, + "grad_norm": 3.8125, + "learning_rate": 0.0022635590297869833, + "loss": 2.7546, + "mean_token_accuracy": 0.43037015199661255, + "num_tokens": 8744728714.0, + "step": 17106 + }, + { + "epoch": 4.626014061654948, + "grad_norm": 4.15625, + "learning_rate": 0.0022631804416909187, + "loss": 2.7558, + "mean_token_accuracy": 0.44238919019699097, + "num_tokens": 8745185911.0, + "step": 17107 + }, + { + "epoch": 4.626284478096268, + "grad_norm": 3.390625, + "learning_rate": 0.0022628021216661697, + "loss": 2.7823, + "mean_token_accuracy": 0.4490728974342346, + "num_tokens": 8745710196.0, + "step": 17108 + }, + { + "epoch": 4.626554894537588, + "grad_norm": 4.0625, + "learning_rate": 0.002262424069724348, + "loss": 2.6061, + "mean_token_accuracy": 0.47417861223220825, + "num_tokens": 8746140617.0, + "step": 17109 + }, + { + "epoch": 4.626825310978908, + "grad_norm": 3.625, + "learning_rate": 0.00226204628587705, + "loss": 2.6938, + "mean_token_accuracy": 0.4462623596191406, + "num_tokens": 8746664874.0, + "step": 17110 + }, + { + "epoch": 4.627095727420227, + "grad_norm": 34.25, + "learning_rate": 0.00226166877013587, + "loss": 3.5978, + "mean_token_accuracy": 0.3382101058959961, + "num_tokens": 8747189033.0, + "step": 17111 + }, + { + "epoch": 4.627366143861547, + "grad_norm": 8.25, + "learning_rate": 0.0022612915225123896, + "loss": 2.7731, + "mean_token_accuracy": 0.42254120111465454, + "num_tokens": 8747713312.0, + "step": 17112 + }, + { + "epoch": 4.627636560302866, + "grad_norm": 2.796875, + "learning_rate": 0.0022609145430181835, + "loss": 2.7359, + "mean_token_accuracy": 0.4505358934402466, + "num_tokens": 8748237555.0, + "step": 17113 + }, + { + "epoch": 4.627906976744186, + "grad_norm": 3.40625, + "learning_rate": 0.00226053783166482, + "loss": 2.8567, + "mean_token_accuracy": 0.4391452968120575, + "num_tokens": 8748699855.0, + "step": 17114 + }, + { + "epoch": 4.6281773931855055, + "grad_norm": 3.71875, + "learning_rate": 0.002260161388463856, + "loss": 2.8489, + "mean_token_accuracy": 0.4513222575187683, + "num_tokens": 8749224086.0, + "step": 17115 + }, + { + "epoch": 4.628447809626826, + "grad_norm": 4.25, + "learning_rate": 0.0022597852134268424, + "loss": 2.7959, + "mean_token_accuracy": 0.44663506746292114, + "num_tokens": 8749748260.0, + "step": 17116 + }, + { + "epoch": 4.628718226068145, + "grad_norm": 3.84375, + "learning_rate": 0.002259409306565323, + "loss": 2.8158, + "mean_token_accuracy": 0.4460463523864746, + "num_tokens": 8750272459.0, + "step": 17117 + }, + { + "epoch": 4.628988642509465, + "grad_norm": 3.640625, + "learning_rate": 0.0022590336678908296, + "loss": 2.608, + "mean_token_accuracy": 0.4479674696922302, + "num_tokens": 8750777575.0, + "step": 17118 + }, + { + "epoch": 4.629259058950784, + "grad_norm": 3.734375, + "learning_rate": 0.0022586582974148894, + "loss": 2.7333, + "mean_token_accuracy": 0.44216424226760864, + "num_tokens": 8751301798.0, + "step": 17119 + }, + { + "epoch": 4.629529475392104, + "grad_norm": 4.3125, + "learning_rate": 0.002258283195149019, + "loss": 2.6758, + "mean_token_accuracy": 0.4419734477996826, + "num_tokens": 8751825935.0, + "step": 17120 + }, + { + "epoch": 4.629799891833423, + "grad_norm": 3.640625, + "learning_rate": 0.0022579083611047276, + "loss": 2.7355, + "mean_token_accuracy": 0.4577060043811798, + "num_tokens": 8752350164.0, + "step": 17121 + }, + { + "epoch": 4.630070308274743, + "grad_norm": 6.21875, + "learning_rate": 0.002257533795293516, + "loss": 2.5829, + "mean_token_accuracy": 0.5068938136100769, + "num_tokens": 8752850785.0, + "step": 17122 + }, + { + "epoch": 4.630340724716063, + "grad_norm": 3.53125, + "learning_rate": 0.002257159497726879, + "loss": 2.861, + "mean_token_accuracy": 0.43997377157211304, + "num_tokens": 8753374994.0, + "step": 17123 + }, + { + "epoch": 4.630611141157383, + "grad_norm": 4.5625, + "learning_rate": 0.0022567854684162987, + "loss": 2.8491, + "mean_token_accuracy": 0.4345848858356476, + "num_tokens": 8753899187.0, + "step": 17124 + }, + { + "epoch": 4.630881557598702, + "grad_norm": 3.65625, + "learning_rate": 0.0022564117073732523, + "loss": 2.8852, + "mean_token_accuracy": 0.4322337508201599, + "num_tokens": 8754423449.0, + "step": 17125 + }, + { + "epoch": 4.631151974040022, + "grad_norm": 4.125, + "learning_rate": 0.0022560382146092086, + "loss": 2.8155, + "mean_token_accuracy": 0.43415936827659607, + "num_tokens": 8754947626.0, + "step": 17126 + }, + { + "epoch": 4.631422390481341, + "grad_norm": 3.578125, + "learning_rate": 0.002255664990135626, + "loss": 2.9085, + "mean_token_accuracy": 0.4500476121902466, + "num_tokens": 8755418368.0, + "step": 17127 + }, + { + "epoch": 4.631692806922661, + "grad_norm": 3.59375, + "learning_rate": 0.002255292033963957, + "loss": 2.7217, + "mean_token_accuracy": 0.4491885304450989, + "num_tokens": 8755942607.0, + "step": 17128 + }, + { + "epoch": 4.6319632233639805, + "grad_norm": 3.546875, + "learning_rate": 0.0022549193461056456, + "loss": 2.6253, + "mean_token_accuracy": 0.4594653844833374, + "num_tokens": 8756466787.0, + "step": 17129 + }, + { + "epoch": 4.6322336398053, + "grad_norm": 6.65625, + "learning_rate": 0.002254546926572126, + "loss": 2.4739, + "mean_token_accuracy": 0.4920904040336609, + "num_tokens": 8756943027.0, + "step": 17130 + }, + { + "epoch": 4.63250405624662, + "grad_norm": 14.875, + "learning_rate": 0.002254174775374825, + "loss": 2.3422, + "mean_token_accuracy": 0.5264440774917603, + "num_tokens": 8757467259.0, + "step": 17131 + }, + { + "epoch": 4.63277447268794, + "grad_norm": 6.96875, + "learning_rate": 0.0022538028925251623, + "loss": 2.8098, + "mean_token_accuracy": 0.45246556401252747, + "num_tokens": 8757979410.0, + "step": 17132 + }, + { + "epoch": 4.633044889129259, + "grad_norm": 4.6875, + "learning_rate": 0.002253431278034547, + "loss": 2.8335, + "mean_token_accuracy": 0.4417804181575775, + "num_tokens": 8758433123.0, + "step": 17133 + }, + { + "epoch": 4.633315305570578, + "grad_norm": 11.6875, + "learning_rate": 0.0022530599319143833, + "loss": 2.4231, + "mean_token_accuracy": 0.5131155252456665, + "num_tokens": 8758893806.0, + "step": 17134 + }, + { + "epoch": 4.633585722011898, + "grad_norm": 3.609375, + "learning_rate": 0.002252688854176062, + "loss": 2.7766, + "mean_token_accuracy": 0.44225579500198364, + "num_tokens": 8759418017.0, + "step": 17135 + }, + { + "epoch": 4.633856138453218, + "grad_norm": 3.03125, + "learning_rate": 0.002252318044830972, + "loss": 2.6721, + "mean_token_accuracy": 0.4485543668270111, + "num_tokens": 8759856513.0, + "step": 17136 + }, + { + "epoch": 4.634126554894538, + "grad_norm": 3.09375, + "learning_rate": 0.00225194750389049, + "loss": 2.9128, + "mean_token_accuracy": 0.43287593126296997, + "num_tokens": 8760380757.0, + "step": 17137 + }, + { + "epoch": 4.634396971335857, + "grad_norm": 3.53125, + "learning_rate": 0.002251577231365985, + "loss": 2.7195, + "mean_token_accuracy": 0.4544793963432312, + "num_tokens": 8760904983.0, + "step": 17138 + }, + { + "epoch": 4.634667387777177, + "grad_norm": 3.703125, + "learning_rate": 0.0022512072272688165, + "loss": 2.8311, + "mean_token_accuracy": 0.4452677369117737, + "num_tokens": 8761429185.0, + "step": 17139 + }, + { + "epoch": 4.634937804218496, + "grad_norm": 3.875, + "learning_rate": 0.0022508374916103396, + "loss": 2.8823, + "mean_token_accuracy": 0.43593019247055054, + "num_tokens": 8761953262.0, + "step": 17140 + }, + { + "epoch": 4.635208220659816, + "grad_norm": 4.15625, + "learning_rate": 0.0022504680244018983, + "loss": 2.7468, + "mean_token_accuracy": 0.45288586616516113, + "num_tokens": 8762386239.0, + "step": 17141 + }, + { + "epoch": 4.635478637101135, + "grad_norm": 3.171875, + "learning_rate": 0.0022500988256548286, + "loss": 2.679, + "mean_token_accuracy": 0.4360989034175873, + "num_tokens": 8762910484.0, + "step": 17142 + }, + { + "epoch": 4.6357490535424555, + "grad_norm": 2.796875, + "learning_rate": 0.0022497298953804597, + "loss": 2.7737, + "mean_token_accuracy": 0.44775527715682983, + "num_tokens": 8763434716.0, + "step": 17143 + }, + { + "epoch": 4.636019469983775, + "grad_norm": 3.59375, + "learning_rate": 0.0022493612335901083, + "loss": 2.7666, + "mean_token_accuracy": 0.45258957147598267, + "num_tokens": 8763958814.0, + "step": 17144 + }, + { + "epoch": 4.636289886425095, + "grad_norm": 3.453125, + "learning_rate": 0.00224899284029509, + "loss": 2.7672, + "mean_token_accuracy": 0.43120327591896057, + "num_tokens": 8764483083.0, + "step": 17145 + }, + { + "epoch": 4.636560302866414, + "grad_norm": 3.734375, + "learning_rate": 0.0022486247155067062, + "loss": 2.7826, + "mean_token_accuracy": 0.4352053999900818, + "num_tokens": 8765007168.0, + "step": 17146 + }, + { + "epoch": 4.636830719307734, + "grad_norm": 3.953125, + "learning_rate": 0.0022482568592362517, + "loss": 2.7459, + "mean_token_accuracy": 0.4640311002731323, + "num_tokens": 8765485243.0, + "step": 17147 + }, + { + "epoch": 4.637101135749053, + "grad_norm": 4.0625, + "learning_rate": 0.0022478892714950143, + "loss": 2.7124, + "mean_token_accuracy": 0.4512466788291931, + "num_tokens": 8766009490.0, + "step": 17148 + }, + { + "epoch": 4.637371552190373, + "grad_norm": 3.8125, + "learning_rate": 0.0022475219522942714, + "loss": 2.6168, + "mean_token_accuracy": 0.4601394534111023, + "num_tokens": 8766533619.0, + "step": 17149 + }, + { + "epoch": 4.6376419686316925, + "grad_norm": 3.734375, + "learning_rate": 0.0022471549016452952, + "loss": 2.6307, + "mean_token_accuracy": 0.464336097240448, + "num_tokens": 8767047173.0, + "step": 17150 + }, + { + "epoch": 4.637912385073013, + "grad_norm": 35.0, + "learning_rate": 0.0022467881195593472, + "loss": 2.898, + "mean_token_accuracy": 0.4309561252593994, + "num_tokens": 8767525044.0, + "step": 17151 + }, + { + "epoch": 4.638182801514332, + "grad_norm": 7.375, + "learning_rate": 0.0022464216060476805, + "loss": 2.7668, + "mean_token_accuracy": 0.4547807574272156, + "num_tokens": 8767999786.0, + "step": 17152 + }, + { + "epoch": 4.638453217955652, + "grad_norm": 2.265625, + "learning_rate": 0.0022460553611215422, + "loss": 2.9198, + "mean_token_accuracy": 0.4407515525817871, + "num_tokens": 8768514453.0, + "step": 17153 + }, + { + "epoch": 4.638723634396971, + "grad_norm": 4.25, + "learning_rate": 0.0022456893847921694, + "loss": 2.7278, + "mean_token_accuracy": 0.421194463968277, + "num_tokens": 8769038668.0, + "step": 17154 + }, + { + "epoch": 4.638994050838291, + "grad_norm": 3.28125, + "learning_rate": 0.0022453236770707905, + "loss": 2.7854, + "mean_token_accuracy": 0.45446348190307617, + "num_tokens": 8769507954.0, + "step": 17155 + }, + { + "epoch": 4.63926446727961, + "grad_norm": 3.53125, + "learning_rate": 0.0022449582379686266, + "loss": 2.6616, + "mean_token_accuracy": 0.4335199296474457, + "num_tokens": 8770032224.0, + "step": 17156 + }, + { + "epoch": 4.6395348837209305, + "grad_norm": 4.4375, + "learning_rate": 0.0022445930674968913, + "loss": 2.6714, + "mean_token_accuracy": 0.4353815019130707, + "num_tokens": 8770556353.0, + "step": 17157 + }, + { + "epoch": 4.63980530016225, + "grad_norm": 3.65625, + "learning_rate": 0.002244228165666789, + "loss": 2.8061, + "mean_token_accuracy": 0.44800812005996704, + "num_tokens": 8771080411.0, + "step": 17158 + }, + { + "epoch": 4.64007571660357, + "grad_norm": 4.3125, + "learning_rate": 0.0022438635324895147, + "loss": 2.7302, + "mean_token_accuracy": 0.4519822597503662, + "num_tokens": 8771587811.0, + "step": 17159 + }, + { + "epoch": 4.640346133044889, + "grad_norm": 4.6875, + "learning_rate": 0.0022434991679762587, + "loss": 2.759, + "mean_token_accuracy": 0.4542986750602722, + "num_tokens": 8772026937.0, + "step": 17160 + }, + { + "epoch": 4.640616549486209, + "grad_norm": 4.75, + "learning_rate": 0.002243135072138199, + "loss": 2.5912, + "mean_token_accuracy": 0.4621753692626953, + "num_tokens": 8772551199.0, + "step": 17161 + }, + { + "epoch": 4.640886965927528, + "grad_norm": 3.828125, + "learning_rate": 0.002242771244986507, + "loss": 2.9348, + "mean_token_accuracy": 0.43762338161468506, + "num_tokens": 8773016216.0, + "step": 17162 + }, + { + "epoch": 4.641157382368848, + "grad_norm": 4.15625, + "learning_rate": 0.0022424076865323462, + "loss": 2.749, + "mean_token_accuracy": 0.4508328139781952, + "num_tokens": 8773540316.0, + "step": 17163 + }, + { + "epoch": 4.6414277988101675, + "grad_norm": 3.953125, + "learning_rate": 0.0022420443967868724, + "loss": 2.7343, + "mean_token_accuracy": 0.4403841495513916, + "num_tokens": 8774020611.0, + "step": 17164 + }, + { + "epoch": 4.641698215251488, + "grad_norm": 3.65625, + "learning_rate": 0.0022416813757612317, + "loss": 2.6769, + "mean_token_accuracy": 0.4409799873828888, + "num_tokens": 8774521200.0, + "step": 17165 + }, + { + "epoch": 4.641968631692807, + "grad_norm": 4.25, + "learning_rate": 0.002241318623466563, + "loss": 2.9672, + "mean_token_accuracy": 0.4275103807449341, + "num_tokens": 8775045469.0, + "step": 17166 + }, + { + "epoch": 4.642239048134127, + "grad_norm": 3.703125, + "learning_rate": 0.002240956139913997, + "loss": 2.8199, + "mean_token_accuracy": 0.44444501399993896, + "num_tokens": 8775512091.0, + "step": 17167 + }, + { + "epoch": 4.642509464575446, + "grad_norm": 4.03125, + "learning_rate": 0.002240593925114656, + "loss": 2.9707, + "mean_token_accuracy": 0.3924288749694824, + "num_tokens": 8776036283.0, + "step": 17168 + }, + { + "epoch": 4.642779881016766, + "grad_norm": 4.09375, + "learning_rate": 0.0022402319790796525, + "loss": 2.8187, + "mean_token_accuracy": 0.46565181016921997, + "num_tokens": 8776498641.0, + "step": 17169 + }, + { + "epoch": 4.643050297458085, + "grad_norm": 3.5625, + "learning_rate": 0.0022398703018200916, + "loss": 2.7125, + "mean_token_accuracy": 0.4406369924545288, + "num_tokens": 8776992695.0, + "step": 17170 + }, + { + "epoch": 4.643320713899405, + "grad_norm": 56.0, + "learning_rate": 0.0022395088933470733, + "loss": 3.9546, + "mean_token_accuracy": 0.31419020891189575, + "num_tokens": 8777516891.0, + "step": 17171 + }, + { + "epoch": 4.643591130340725, + "grad_norm": 9.5, + "learning_rate": 0.0022391477536716836, + "loss": 2.8049, + "mean_token_accuracy": 0.43066471815109253, + "num_tokens": 8778041114.0, + "step": 17172 + }, + { + "epoch": 4.643861546782045, + "grad_norm": 3.5625, + "learning_rate": 0.0022387868828050057, + "loss": 2.6368, + "mean_token_accuracy": 0.447213351726532, + "num_tokens": 8778565340.0, + "step": 17173 + }, + { + "epoch": 4.644131963223364, + "grad_norm": 4.21875, + "learning_rate": 0.002238426280758111, + "loss": 2.6543, + "mean_token_accuracy": 0.43982845544815063, + "num_tokens": 8779089537.0, + "step": 17174 + }, + { + "epoch": 4.644402379664683, + "grad_norm": 3.765625, + "learning_rate": 0.002238065947542065, + "loss": 2.9163, + "mean_token_accuracy": 0.4386034905910492, + "num_tokens": 8779613673.0, + "step": 17175 + }, + { + "epoch": 4.644672796106003, + "grad_norm": 3.703125, + "learning_rate": 0.0022377058831679216, + "loss": 2.5891, + "mean_token_accuracy": 0.4746536314487457, + "num_tokens": 8780055039.0, + "step": 17176 + }, + { + "epoch": 4.644943212547323, + "grad_norm": 3.1875, + "learning_rate": 0.002237346087646731, + "loss": 2.8568, + "mean_token_accuracy": 0.4313603639602661, + "num_tokens": 8780579297.0, + "step": 17177 + }, + { + "epoch": 4.6452136289886425, + "grad_norm": 4.25, + "learning_rate": 0.0022369865609895307, + "loss": 2.9464, + "mean_token_accuracy": 0.4291796386241913, + "num_tokens": 8781103577.0, + "step": 17178 + }, + { + "epoch": 4.645484045429962, + "grad_norm": 4.625, + "learning_rate": 0.0022366273032073526, + "loss": 2.7663, + "mean_token_accuracy": 0.41675305366516113, + "num_tokens": 8781627746.0, + "step": 17179 + }, + { + "epoch": 4.645754461871282, + "grad_norm": 3.953125, + "learning_rate": 0.0022362683143112216, + "loss": 2.8644, + "mean_token_accuracy": 0.4211350679397583, + "num_tokens": 8782151973.0, + "step": 17180 + }, + { + "epoch": 4.646024878312601, + "grad_norm": 4.40625, + "learning_rate": 0.0022359095943121515, + "loss": 2.8061, + "mean_token_accuracy": 0.43206730484962463, + "num_tokens": 8782652097.0, + "step": 17181 + }, + { + "epoch": 4.646295294753921, + "grad_norm": 3.328125, + "learning_rate": 0.0022355511432211473, + "loss": 2.9025, + "mean_token_accuracy": 0.43647080659866333, + "num_tokens": 8783176347.0, + "step": 17182 + }, + { + "epoch": 4.64656571119524, + "grad_norm": 4.15625, + "learning_rate": 0.002235192961049209, + "loss": 2.8473, + "mean_token_accuracy": 0.44093191623687744, + "num_tokens": 8783646071.0, + "step": 17183 + }, + { + "epoch": 4.64683612763656, + "grad_norm": 3.4375, + "learning_rate": 0.0022348350478073272, + "loss": 2.8275, + "mean_token_accuracy": 0.4487912654876709, + "num_tokens": 8784170179.0, + "step": 17184 + }, + { + "epoch": 4.64710654407788, + "grad_norm": 4.4375, + "learning_rate": 0.002234477403506481, + "loss": 2.8023, + "mean_token_accuracy": 0.44956672191619873, + "num_tokens": 8784694205.0, + "step": 17185 + }, + { + "epoch": 4.6473769605192, + "grad_norm": 4.28125, + "learning_rate": 0.0022341200281576473, + "loss": 2.7763, + "mean_token_accuracy": 0.43021154403686523, + "num_tokens": 8785204851.0, + "step": 17186 + }, + { + "epoch": 4.647647376960519, + "grad_norm": 3.734375, + "learning_rate": 0.0022337629217717885, + "loss": 2.8376, + "mean_token_accuracy": 0.4382907748222351, + "num_tokens": 8785729125.0, + "step": 17187 + }, + { + "epoch": 4.647917793401839, + "grad_norm": 3.8125, + "learning_rate": 0.0022334060843598644, + "loss": 2.7038, + "mean_token_accuracy": 0.4470968246459961, + "num_tokens": 8786253074.0, + "step": 17188 + }, + { + "epoch": 4.648188209843158, + "grad_norm": 3.375, + "learning_rate": 0.002233049515932823, + "loss": 2.5767, + "mean_token_accuracy": 0.4786549508571625, + "num_tokens": 8786714934.0, + "step": 17189 + }, + { + "epoch": 4.648458626284478, + "grad_norm": 3.765625, + "learning_rate": 0.0022326932165016027, + "loss": 2.6431, + "mean_token_accuracy": 0.451741099357605, + "num_tokens": 8787239219.0, + "step": 17190 + }, + { + "epoch": 4.6487290427257975, + "grad_norm": 33.5, + "learning_rate": 0.0022323371860771377, + "loss": 3.599, + "mean_token_accuracy": 0.42274007201194763, + "num_tokens": 8787763349.0, + "step": 17191 + }, + { + "epoch": 4.6489994591671175, + "grad_norm": 7.34375, + "learning_rate": 0.0022319814246703528, + "loss": 2.9539, + "mean_token_accuracy": 0.4191320538520813, + "num_tokens": 8788287557.0, + "step": 17192 + }, + { + "epoch": 4.649269875608437, + "grad_norm": 3.515625, + "learning_rate": 0.002231625932292161, + "loss": 2.7522, + "mean_token_accuracy": 0.43665215373039246, + "num_tokens": 8788811755.0, + "step": 17193 + }, + { + "epoch": 4.649540292049757, + "grad_norm": 4.03125, + "learning_rate": 0.0022312707089534745, + "loss": 2.6765, + "mean_token_accuracy": 0.43585672974586487, + "num_tokens": 8789335990.0, + "step": 17194 + }, + { + "epoch": 4.649810708491076, + "grad_norm": 3.296875, + "learning_rate": 0.0022309157546651884, + "loss": 2.7738, + "mean_token_accuracy": 0.4277629256248474, + "num_tokens": 8789858039.0, + "step": 17195 + }, + { + "epoch": 4.650081124932396, + "grad_norm": 4.0, + "learning_rate": 0.0022305610694381946, + "loss": 2.6112, + "mean_token_accuracy": 0.46021997928619385, + "num_tokens": 8790339430.0, + "step": 17196 + }, + { + "epoch": 4.650351541373715, + "grad_norm": 3.328125, + "learning_rate": 0.0022302066532833777, + "loss": 2.6341, + "mean_token_accuracy": 0.4351416826248169, + "num_tokens": 8790863584.0, + "step": 17197 + }, + { + "epoch": 4.650621957815035, + "grad_norm": 3.453125, + "learning_rate": 0.0022298525062116104, + "loss": 2.7503, + "mean_token_accuracy": 0.463248610496521, + "num_tokens": 8791338297.0, + "step": 17198 + }, + { + "epoch": 4.650892374256355, + "grad_norm": 3.546875, + "learning_rate": 0.0022294986282337583, + "loss": 2.6627, + "mean_token_accuracy": 0.4535398483276367, + "num_tokens": 8791848536.0, + "step": 17199 + }, + { + "epoch": 4.651162790697675, + "grad_norm": 3.75, + "learning_rate": 0.002229145019360683, + "loss": 2.9165, + "mean_token_accuracy": 0.4427734911441803, + "num_tokens": 8792372615.0, + "step": 17200 + }, + { + "epoch": 4.651433207138994, + "grad_norm": 3.890625, + "learning_rate": 0.002228791679603231, + "loss": 2.7407, + "mean_token_accuracy": 0.44385114312171936, + "num_tokens": 8792896889.0, + "step": 17201 + }, + { + "epoch": 4.651703623580314, + "grad_norm": 3.34375, + "learning_rate": 0.002228438608972243, + "loss": 2.8283, + "mean_token_accuracy": 0.4522574543952942, + "num_tokens": 8793421171.0, + "step": 17202 + }, + { + "epoch": 4.651974040021633, + "grad_norm": 4.125, + "learning_rate": 0.0022280858074785557, + "loss": 2.7506, + "mean_token_accuracy": 0.4550904631614685, + "num_tokens": 8793945303.0, + "step": 17203 + }, + { + "epoch": 4.652244456462953, + "grad_norm": 5.0, + "learning_rate": 0.002227733275132992, + "loss": 2.8819, + "mean_token_accuracy": 0.44732826948165894, + "num_tokens": 8794469572.0, + "step": 17204 + }, + { + "epoch": 4.6525148729042725, + "grad_norm": 3.875, + "learning_rate": 0.0022273810119463684, + "loss": 2.5908, + "mean_token_accuracy": 0.4453570246696472, + "num_tokens": 8794993766.0, + "step": 17205 + }, + { + "epoch": 4.6527852893455925, + "grad_norm": 3.265625, + "learning_rate": 0.002227029017929495, + "loss": 2.7153, + "mean_token_accuracy": 0.44688770174980164, + "num_tokens": 8795518038.0, + "step": 17206 + }, + { + "epoch": 4.653055705786912, + "grad_norm": 3.640625, + "learning_rate": 0.0022266772930931707, + "loss": 2.7993, + "mean_token_accuracy": 0.43210873007774353, + "num_tokens": 8796042187.0, + "step": 17207 + }, + { + "epoch": 4.653326122228232, + "grad_norm": 4.46875, + "learning_rate": 0.0022263258374481867, + "loss": 2.6188, + "mean_token_accuracy": 0.46386492252349854, + "num_tokens": 8796521297.0, + "step": 17208 + }, + { + "epoch": 4.653596538669551, + "grad_norm": 4.0, + "learning_rate": 0.0022259746510053286, + "loss": 2.6263, + "mean_token_accuracy": 0.4556286931037903, + "num_tokens": 8797045451.0, + "step": 17209 + }, + { + "epoch": 4.653866955110871, + "grad_norm": 5.0, + "learning_rate": 0.0022256237337753703, + "loss": 2.8135, + "mean_token_accuracy": 0.44206398725509644, + "num_tokens": 8797569641.0, + "step": 17210 + }, + { + "epoch": 4.65413737155219, + "grad_norm": 58.5, + "learning_rate": 0.0022252730857690805, + "loss": 3.6152, + "mean_token_accuracy": 0.3727589249610901, + "num_tokens": 8798093757.0, + "step": 17211 + }, + { + "epoch": 4.6544077879935095, + "grad_norm": 7.09375, + "learning_rate": 0.002224922706997217, + "loss": 2.8698, + "mean_token_accuracy": 0.43366628885269165, + "num_tokens": 8798617933.0, + "step": 17212 + }, + { + "epoch": 4.65467820443483, + "grad_norm": 2.609375, + "learning_rate": 0.0022245725974705303, + "loss": 2.7908, + "mean_token_accuracy": 0.43649429082870483, + "num_tokens": 8799142100.0, + "step": 17213 + }, + { + "epoch": 4.65494862087615, + "grad_norm": 3.296875, + "learning_rate": 0.0022242227571997635, + "loss": 2.6845, + "mean_token_accuracy": 0.456083208322525, + "num_tokens": 8799666289.0, + "step": 17214 + }, + { + "epoch": 4.655219037317469, + "grad_norm": 3.53125, + "learning_rate": 0.00222387318619565, + "loss": 2.8064, + "mean_token_accuracy": 0.4569007456302643, + "num_tokens": 8800103886.0, + "step": 17215 + }, + { + "epoch": 4.655489453758788, + "grad_norm": 3.125, + "learning_rate": 0.0022235238844689163, + "loss": 2.7485, + "mean_token_accuracy": 0.46047693490982056, + "num_tokens": 8800594194.0, + "step": 17216 + }, + { + "epoch": 4.655759870200108, + "grad_norm": 3.6875, + "learning_rate": 0.00222317485203028, + "loss": 2.7668, + "mean_token_accuracy": 0.4581148624420166, + "num_tokens": 8801051076.0, + "step": 17217 + }, + { + "epoch": 4.656030286641428, + "grad_norm": 2.890625, + "learning_rate": 0.00222282608889045, + "loss": 2.7752, + "mean_token_accuracy": 0.43073874711990356, + "num_tokens": 8801542335.0, + "step": 17218 + }, + { + "epoch": 4.6563007030827475, + "grad_norm": 3.421875, + "learning_rate": 0.0022224775950601275, + "loss": 2.4977, + "mean_token_accuracy": 0.47971290349960327, + "num_tokens": 8802037301.0, + "step": 17219 + }, + { + "epoch": 4.656571119524067, + "grad_norm": 3.53125, + "learning_rate": 0.0022221293705500054, + "loss": 2.7628, + "mean_token_accuracy": 0.4412565231323242, + "num_tokens": 8802561423.0, + "step": 17220 + }, + { + "epoch": 4.656841535965387, + "grad_norm": 3.703125, + "learning_rate": 0.0022217814153707677, + "loss": 2.681, + "mean_token_accuracy": 0.4516475796699524, + "num_tokens": 8803079855.0, + "step": 17221 + }, + { + "epoch": 4.657111952406706, + "grad_norm": 3.59375, + "learning_rate": 0.0022214337295330916, + "loss": 2.7851, + "mean_token_accuracy": 0.44500699639320374, + "num_tokens": 8803603948.0, + "step": 17222 + }, + { + "epoch": 4.657382368848026, + "grad_norm": 4.28125, + "learning_rate": 0.0022210863130476453, + "loss": 2.8556, + "mean_token_accuracy": 0.4439096450805664, + "num_tokens": 8804128014.0, + "step": 17223 + }, + { + "epoch": 4.657652785289345, + "grad_norm": 4.09375, + "learning_rate": 0.0022207391659250886, + "loss": 2.8414, + "mean_token_accuracy": 0.4144880771636963, + "num_tokens": 8804652135.0, + "step": 17224 + }, + { + "epoch": 4.657923201730665, + "grad_norm": 3.53125, + "learning_rate": 0.002220392288176071, + "loss": 2.7617, + "mean_token_accuracy": 0.45696374773979187, + "num_tokens": 8805176383.0, + "step": 17225 + }, + { + "epoch": 4.6581936181719845, + "grad_norm": 4.375, + "learning_rate": 0.002220045679811237, + "loss": 2.6869, + "mean_token_accuracy": 0.4350471496582031, + "num_tokens": 8805700496.0, + "step": 17226 + }, + { + "epoch": 4.658464034613305, + "grad_norm": 4.53125, + "learning_rate": 0.002219699340841223, + "loss": 2.782, + "mean_token_accuracy": 0.42462772130966187, + "num_tokens": 8806173824.0, + "step": 17227 + }, + { + "epoch": 4.658734451054624, + "grad_norm": 3.21875, + "learning_rate": 0.002219353271276653, + "loss": 2.8279, + "mean_token_accuracy": 0.44171029329299927, + "num_tokens": 8806678864.0, + "step": 17228 + }, + { + "epoch": 4.659004867495944, + "grad_norm": 3.78125, + "learning_rate": 0.0022190074711281485, + "loss": 2.6984, + "mean_token_accuracy": 0.4639210104942322, + "num_tokens": 8807195014.0, + "step": 17229 + }, + { + "epoch": 4.659275283937263, + "grad_norm": 4.09375, + "learning_rate": 0.0022186619404063173, + "loss": 3.0237, + "mean_token_accuracy": 0.40858298540115356, + "num_tokens": 8807677824.0, + "step": 17230 + }, + { + "epoch": 4.659545700378583, + "grad_norm": 112.5, + "learning_rate": 0.002218316679121761, + "loss": 4.796, + "mean_token_accuracy": 0.254787802696228, + "num_tokens": 8808201943.0, + "step": 17231 + }, + { + "epoch": 4.659816116819902, + "grad_norm": 7.0, + "learning_rate": 0.0022179716872850748, + "loss": 2.9107, + "mean_token_accuracy": 0.439744234085083, + "num_tokens": 8808726132.0, + "step": 17232 + }, + { + "epoch": 4.6600865332612225, + "grad_norm": 3.5, + "learning_rate": 0.0022176269649068444, + "loss": 2.8599, + "mean_token_accuracy": 0.4258228540420532, + "num_tokens": 8809250405.0, + "step": 17233 + }, + { + "epoch": 4.660356949702542, + "grad_norm": 3.59375, + "learning_rate": 0.002217282511997645, + "loss": 2.6484, + "mean_token_accuracy": 0.42233824729919434, + "num_tokens": 8809774532.0, + "step": 17234 + }, + { + "epoch": 4.660627366143862, + "grad_norm": 3.078125, + "learning_rate": 0.002216938328568047, + "loss": 2.7488, + "mean_token_accuracy": 0.44468897581100464, + "num_tokens": 8810265517.0, + "step": 17235 + }, + { + "epoch": 4.660897782585181, + "grad_norm": 3.703125, + "learning_rate": 0.002216594414628609, + "loss": 2.917, + "mean_token_accuracy": 0.43116509914398193, + "num_tokens": 8810789738.0, + "step": 17236 + }, + { + "epoch": 4.661168199026501, + "grad_norm": 4.40625, + "learning_rate": 0.002216250770189886, + "loss": 2.8084, + "mean_token_accuracy": 0.44302603602409363, + "num_tokens": 8811313923.0, + "step": 17237 + }, + { + "epoch": 4.66143861546782, + "grad_norm": 3.8125, + "learning_rate": 0.0022159073952624214, + "loss": 2.7746, + "mean_token_accuracy": 0.43700939416885376, + "num_tokens": 8811838080.0, + "step": 17238 + }, + { + "epoch": 4.66170903190914, + "grad_norm": 3.625, + "learning_rate": 0.002215564289856748, + "loss": 2.8622, + "mean_token_accuracy": 0.43882501125335693, + "num_tokens": 8812362296.0, + "step": 17239 + }, + { + "epoch": 4.6619794483504595, + "grad_norm": 3.5625, + "learning_rate": 0.002215221453983397, + "loss": 2.6023, + "mean_token_accuracy": 0.4783807396888733, + "num_tokens": 8812850487.0, + "step": 17240 + }, + { + "epoch": 4.66224986479178, + "grad_norm": 3.84375, + "learning_rate": 0.0022148788876528865, + "loss": 2.8627, + "mean_token_accuracy": 0.42534878849983215, + "num_tokens": 8813374703.0, + "step": 17241 + }, + { + "epoch": 4.662520281233099, + "grad_norm": 3.28125, + "learning_rate": 0.002214536590875726, + "loss": 2.6583, + "mean_token_accuracy": 0.4560853838920593, + "num_tokens": 8813868516.0, + "step": 17242 + }, + { + "epoch": 4.662790697674419, + "grad_norm": 3.015625, + "learning_rate": 0.0022141945636624206, + "loss": 2.7667, + "mean_token_accuracy": 0.44954007863998413, + "num_tokens": 8814373947.0, + "step": 17243 + }, + { + "epoch": 4.663061114115738, + "grad_norm": 4.34375, + "learning_rate": 0.002213852806023463, + "loss": 2.5692, + "mean_token_accuracy": 0.4735407829284668, + "num_tokens": 8814898190.0, + "step": 17244 + }, + { + "epoch": 4.663331530557058, + "grad_norm": 3.140625, + "learning_rate": 0.002213511317969339, + "loss": 2.7631, + "mean_token_accuracy": 0.45621368288993835, + "num_tokens": 8815422369.0, + "step": 17245 + }, + { + "epoch": 4.663601946998377, + "grad_norm": 3.84375, + "learning_rate": 0.0022131700995105274, + "loss": 2.7051, + "mean_token_accuracy": 0.44575127959251404, + "num_tokens": 8815941991.0, + "step": 17246 + }, + { + "epoch": 4.6638723634396975, + "grad_norm": 3.78125, + "learning_rate": 0.002212829150657498, + "loss": 2.5473, + "mean_token_accuracy": 0.4604160785675049, + "num_tokens": 8816463795.0, + "step": 17247 + }, + { + "epoch": 4.664142779881017, + "grad_norm": 4.09375, + "learning_rate": 0.00221248847142071, + "loss": 2.6509, + "mean_token_accuracy": 0.4322415590286255, + "num_tokens": 8816988067.0, + "step": 17248 + }, + { + "epoch": 4.664413196322337, + "grad_norm": 3.171875, + "learning_rate": 0.0022121480618106193, + "loss": 2.7535, + "mean_token_accuracy": 0.4447811245918274, + "num_tokens": 8817512256.0, + "step": 17249 + }, + { + "epoch": 4.664683612763656, + "grad_norm": 3.484375, + "learning_rate": 0.0022118079218376697, + "loss": 2.8137, + "mean_token_accuracy": 0.44429486989974976, + "num_tokens": 8818036426.0, + "step": 17250 + }, + { + "epoch": 4.664954029204976, + "grad_norm": 20.25, + "learning_rate": 0.0022114680515122965, + "loss": 2.6877, + "mean_token_accuracy": 0.42021942138671875, + "num_tokens": 8818560675.0, + "step": 17251 + }, + { + "epoch": 4.665224445646295, + "grad_norm": 7.125, + "learning_rate": 0.00221112845084493, + "loss": 2.8174, + "mean_token_accuracy": 0.4129314124584198, + "num_tokens": 8819084861.0, + "step": 17252 + }, + { + "epoch": 4.665494862087614, + "grad_norm": 3.015625, + "learning_rate": 0.0022107891198459894, + "loss": 2.7759, + "mean_token_accuracy": 0.4418480396270752, + "num_tokens": 8819589693.0, + "step": 17253 + }, + { + "epoch": 4.6657652785289345, + "grad_norm": 3.84375, + "learning_rate": 0.0022104500585258843, + "loss": 2.776, + "mean_token_accuracy": 0.43902575969696045, + "num_tokens": 8820113915.0, + "step": 17254 + }, + { + "epoch": 4.666035694970255, + "grad_norm": 3.453125, + "learning_rate": 0.0022101112668950207, + "loss": 2.7006, + "mean_token_accuracy": 0.43646636605262756, + "num_tokens": 8820631909.0, + "step": 17255 + }, + { + "epoch": 4.666306111411574, + "grad_norm": 3.6875, + "learning_rate": 0.002209772744963793, + "loss": 3.0435, + "mean_token_accuracy": 0.40258705615997314, + "num_tokens": 8821156156.0, + "step": 17256 + }, + { + "epoch": 4.666576527852893, + "grad_norm": 3.703125, + "learning_rate": 0.0022094344927425873, + "loss": 2.7802, + "mean_token_accuracy": 0.4340376853942871, + "num_tokens": 8821680334.0, + "step": 17257 + }, + { + "epoch": 4.666846944294213, + "grad_norm": 4.46875, + "learning_rate": 0.0022090965102417835, + "loss": 2.6534, + "mean_token_accuracy": 0.44194644689559937, + "num_tokens": 8822204600.0, + "step": 17258 + }, + { + "epoch": 4.667117360735533, + "grad_norm": 4.28125, + "learning_rate": 0.00220875879747175, + "loss": 2.6553, + "mean_token_accuracy": 0.45647957921028137, + "num_tokens": 8822728858.0, + "step": 17259 + }, + { + "epoch": 4.667387777176852, + "grad_norm": 3.921875, + "learning_rate": 0.0022084213544428517, + "loss": 2.784, + "mean_token_accuracy": 0.44916170835494995, + "num_tokens": 8823229645.0, + "step": 17260 + }, + { + "epoch": 4.667658193618172, + "grad_norm": 3.65625, + "learning_rate": 0.0022080841811654393, + "loss": 2.7361, + "mean_token_accuracy": 0.46036243438720703, + "num_tokens": 8823753712.0, + "step": 17261 + }, + { + "epoch": 4.667928610059492, + "grad_norm": 4.28125, + "learning_rate": 0.0022077472776498588, + "loss": 2.8126, + "mean_token_accuracy": 0.4381612539291382, + "num_tokens": 8824277896.0, + "step": 17262 + }, + { + "epoch": 4.668199026500811, + "grad_norm": 3.34375, + "learning_rate": 0.0022074106439064494, + "loss": 2.6414, + "mean_token_accuracy": 0.44862329959869385, + "num_tokens": 8824802102.0, + "step": 17263 + }, + { + "epoch": 4.668469442942131, + "grad_norm": 3.515625, + "learning_rate": 0.002207074279945539, + "loss": 2.5745, + "mean_token_accuracy": 0.4562035799026489, + "num_tokens": 8825326340.0, + "step": 17264 + }, + { + "epoch": 4.66873985938345, + "grad_norm": 3.921875, + "learning_rate": 0.0022067381857774466, + "loss": 2.7776, + "mean_token_accuracy": 0.459194153547287, + "num_tokens": 8825850589.0, + "step": 17265 + }, + { + "epoch": 4.66901027582477, + "grad_norm": 3.546875, + "learning_rate": 0.002206402361412487, + "loss": 2.6688, + "mean_token_accuracy": 0.44521304965019226, + "num_tokens": 8826269542.0, + "step": 17266 + }, + { + "epoch": 4.669280692266089, + "grad_norm": 3.203125, + "learning_rate": 0.002206066806860964, + "loss": 2.7298, + "mean_token_accuracy": 0.44426578283309937, + "num_tokens": 8826793821.0, + "step": 17267 + }, + { + "epoch": 4.6695511087074095, + "grad_norm": 3.859375, + "learning_rate": 0.0022057315221331707, + "loss": 2.7299, + "mean_token_accuracy": 0.4446716606616974, + "num_tokens": 8827317943.0, + "step": 17268 + }, + { + "epoch": 4.669821525148729, + "grad_norm": 3.640625, + "learning_rate": 0.002205396507239398, + "loss": 2.8208, + "mean_token_accuracy": 0.4488789141178131, + "num_tokens": 8827820497.0, + "step": 17269 + }, + { + "epoch": 4.670091941590049, + "grad_norm": 3.53125, + "learning_rate": 0.0022050617621899235, + "loss": 2.6095, + "mean_token_accuracy": 0.4709673225879669, + "num_tokens": 8828344662.0, + "step": 17270 + }, + { + "epoch": 4.670362358031368, + "grad_norm": 19.625, + "learning_rate": 0.0022047272869950177, + "loss": 2.2289, + "mean_token_accuracy": 0.5165493488311768, + "num_tokens": 8828840226.0, + "step": 17271 + }, + { + "epoch": 4.670632774472688, + "grad_norm": 10.625, + "learning_rate": 0.002204393081664944, + "loss": 2.9688, + "mean_token_accuracy": 0.4560609459877014, + "num_tokens": 8829252878.0, + "step": 17272 + }, + { + "epoch": 4.670903190914007, + "grad_norm": 3.28125, + "learning_rate": 0.0022040591462099586, + "loss": 2.8785, + "mean_token_accuracy": 0.41351065039634705, + "num_tokens": 8829777024.0, + "step": 17273 + }, + { + "epoch": 4.671173607355327, + "grad_norm": 3.8125, + "learning_rate": 0.0022037254806403033, + "loss": 2.6608, + "mean_token_accuracy": 0.45757126808166504, + "num_tokens": 8830301141.0, + "step": 17274 + }, + { + "epoch": 4.671444023796647, + "grad_norm": 3.875, + "learning_rate": 0.0022033920849662205, + "loss": 2.9022, + "mean_token_accuracy": 0.4367874562740326, + "num_tokens": 8830825347.0, + "step": 17275 + }, + { + "epoch": 4.671714440237967, + "grad_norm": 5.0625, + "learning_rate": 0.0022030589591979363, + "loss": 2.7531, + "mean_token_accuracy": 0.46096694469451904, + "num_tokens": 8831349621.0, + "step": 17276 + }, + { + "epoch": 4.671984856679286, + "grad_norm": 3.203125, + "learning_rate": 0.002202726103345673, + "loss": 2.8759, + "mean_token_accuracy": 0.4449078142642975, + "num_tokens": 8831873894.0, + "step": 17277 + }, + { + "epoch": 4.672255273120606, + "grad_norm": 4.4375, + "learning_rate": 0.002202393517419645, + "loss": 2.7495, + "mean_token_accuracy": 0.46939510107040405, + "num_tokens": 8832345182.0, + "step": 17278 + }, + { + "epoch": 4.672525689561925, + "grad_norm": 4.3125, + "learning_rate": 0.0022020612014300544, + "loss": 2.7499, + "mean_token_accuracy": 0.4215855002403259, + "num_tokens": 8832869401.0, + "step": 17279 + }, + { + "epoch": 4.672796106003245, + "grad_norm": 3.390625, + "learning_rate": 0.002201729155387101, + "loss": 2.8709, + "mean_token_accuracy": 0.44476407766342163, + "num_tokens": 8833393638.0, + "step": 17280 + }, + { + "epoch": 4.673066522444564, + "grad_norm": 4.65625, + "learning_rate": 0.0022013973793009707, + "loss": 2.818, + "mean_token_accuracy": 0.4410482347011566, + "num_tokens": 8833917825.0, + "step": 17281 + }, + { + "epoch": 4.6733369388858845, + "grad_norm": 3.921875, + "learning_rate": 0.0022010658731818435, + "loss": 2.6852, + "mean_token_accuracy": 0.44043219089508057, + "num_tokens": 8834441965.0, + "step": 17282 + }, + { + "epoch": 4.673607355327204, + "grad_norm": 3.890625, + "learning_rate": 0.002200734637039891, + "loss": 2.7944, + "mean_token_accuracy": 0.44604209065437317, + "num_tokens": 8834955654.0, + "step": 17283 + }, + { + "epoch": 4.673877771768524, + "grad_norm": 4.0, + "learning_rate": 0.002200403670885278, + "loss": 2.7818, + "mean_token_accuracy": 0.43879538774490356, + "num_tokens": 8835479816.0, + "step": 17284 + }, + { + "epoch": 4.674148188209843, + "grad_norm": 4.78125, + "learning_rate": 0.0022000729747281567, + "loss": 2.6691, + "mean_token_accuracy": 0.44903868436813354, + "num_tokens": 8835952686.0, + "step": 17285 + }, + { + "epoch": 4.674418604651163, + "grad_norm": 4.71875, + "learning_rate": 0.0021997425485786766, + "loss": 2.712, + "mean_token_accuracy": 0.44777071475982666, + "num_tokens": 8836476896.0, + "step": 17286 + }, + { + "epoch": 4.674689021092482, + "grad_norm": 3.828125, + "learning_rate": 0.002199412392446975, + "loss": 2.7404, + "mean_token_accuracy": 0.441811203956604, + "num_tokens": 8837001136.0, + "step": 17287 + }, + { + "epoch": 4.674959437533802, + "grad_norm": 3.8125, + "learning_rate": 0.002199082506343181, + "loss": 2.756, + "mean_token_accuracy": 0.4379119277000427, + "num_tokens": 8837525342.0, + "step": 17288 + }, + { + "epoch": 4.675229853975122, + "grad_norm": 3.828125, + "learning_rate": 0.0021987528902774186, + "loss": 2.7226, + "mean_token_accuracy": 0.45603060722351074, + "num_tokens": 8838049611.0, + "step": 17289 + }, + { + "epoch": 4.675500270416442, + "grad_norm": 3.703125, + "learning_rate": 0.0021984235442597997, + "loss": 2.7254, + "mean_token_accuracy": 0.45515894889831543, + "num_tokens": 8838546005.0, + "step": 17290 + }, + { + "epoch": 4.675770686857761, + "grad_norm": 27.875, + "learning_rate": 0.0021980944683004304, + "loss": 2.7155, + "mean_token_accuracy": 0.42948710918426514, + "num_tokens": 8839070249.0, + "step": 17291 + }, + { + "epoch": 4.676041103299081, + "grad_norm": 7.4375, + "learning_rate": 0.0021977656624094072, + "loss": 2.5855, + "mean_token_accuracy": 0.4766296148300171, + "num_tokens": 8839594517.0, + "step": 17292 + }, + { + "epoch": 4.6763115197404, + "grad_norm": 3.84375, + "learning_rate": 0.0021974371265968194, + "loss": 2.9495, + "mean_token_accuracy": 0.43022310733795166, + "num_tokens": 8840118687.0, + "step": 17293 + }, + { + "epoch": 4.676581936181719, + "grad_norm": 4.21875, + "learning_rate": 0.002197108860872747, + "loss": 2.6674, + "mean_token_accuracy": 0.43353408575057983, + "num_tokens": 8840642973.0, + "step": 17294 + }, + { + "epoch": 4.676852352623039, + "grad_norm": 4.8125, + "learning_rate": 0.0021967808652472633, + "loss": 2.6293, + "mean_token_accuracy": 0.44952255487442017, + "num_tokens": 8841076702.0, + "step": 17295 + }, + { + "epoch": 4.6771227690643595, + "grad_norm": 4.75, + "learning_rate": 0.002196453139730431, + "loss": 2.6589, + "mean_token_accuracy": 0.4539312720298767, + "num_tokens": 8841546883.0, + "step": 17296 + }, + { + "epoch": 4.677393185505679, + "grad_norm": 3.453125, + "learning_rate": 0.002196125684332305, + "loss": 2.8104, + "mean_token_accuracy": 0.45579609274864197, + "num_tokens": 8842071058.0, + "step": 17297 + }, + { + "epoch": 4.677663601946998, + "grad_norm": 4.03125, + "learning_rate": 0.0021957984990629343, + "loss": 2.892, + "mean_token_accuracy": 0.42818915843963623, + "num_tokens": 8842493104.0, + "step": 17298 + }, + { + "epoch": 4.677934018388318, + "grad_norm": 3.828125, + "learning_rate": 0.002195471583932357, + "loss": 2.7426, + "mean_token_accuracy": 0.44145259261131287, + "num_tokens": 8843017166.0, + "step": 17299 + }, + { + "epoch": 4.678204434829638, + "grad_norm": 3.71875, + "learning_rate": 0.002195144938950603, + "loss": 2.8675, + "mean_token_accuracy": 0.4314958453178406, + "num_tokens": 8843541311.0, + "step": 17300 + }, + { + "epoch": 4.678474851270957, + "grad_norm": 4.1875, + "learning_rate": 0.0021948185641276962, + "loss": 2.7818, + "mean_token_accuracy": 0.4292977750301361, + "num_tokens": 8844065479.0, + "step": 17301 + }, + { + "epoch": 4.6787452677122765, + "grad_norm": 3.4375, + "learning_rate": 0.0021944924594736502, + "loss": 2.8046, + "mean_token_accuracy": 0.4321375787258148, + "num_tokens": 8844589639.0, + "step": 17302 + }, + { + "epoch": 4.679015684153597, + "grad_norm": 3.890625, + "learning_rate": 0.0021941666249984722, + "loss": 2.795, + "mean_token_accuracy": 0.4432840347290039, + "num_tokens": 8845113874.0, + "step": 17303 + }, + { + "epoch": 4.679286100594916, + "grad_norm": 20.5, + "learning_rate": 0.0021938410607121573, + "loss": 2.766, + "mean_token_accuracy": 0.43964600563049316, + "num_tokens": 8845598436.0, + "step": 17304 + }, + { + "epoch": 4.679556517036236, + "grad_norm": 5.96875, + "learning_rate": 0.0021935157666246956, + "loss": 2.9854, + "mean_token_accuracy": 0.4144498407840729, + "num_tokens": 8846122569.0, + "step": 17305 + }, + { + "epoch": 4.679826933477555, + "grad_norm": 3.53125, + "learning_rate": 0.002193190742746067, + "loss": 2.8879, + "mean_token_accuracy": 0.43885302543640137, + "num_tokens": 8846646757.0, + "step": 17306 + }, + { + "epoch": 4.680097349918875, + "grad_norm": 3.078125, + "learning_rate": 0.0021928659890862487, + "loss": 2.8028, + "mean_token_accuracy": 0.4420436918735504, + "num_tokens": 8847106254.0, + "step": 17307 + }, + { + "epoch": 4.680367766360194, + "grad_norm": 4.75, + "learning_rate": 0.002192541505655199, + "loss": 2.6748, + "mean_token_accuracy": 0.4614861011505127, + "num_tokens": 8847630454.0, + "step": 17308 + }, + { + "epoch": 4.6806381828015144, + "grad_norm": 3.921875, + "learning_rate": 0.0021922172924628788, + "loss": 2.562, + "mean_token_accuracy": 0.43716752529144287, + "num_tokens": 8848154681.0, + "step": 17309 + }, + { + "epoch": 4.680908599242834, + "grad_norm": 3.21875, + "learning_rate": 0.002191893349519234, + "loss": 2.6684, + "mean_token_accuracy": 0.4665473699569702, + "num_tokens": 8848678855.0, + "step": 17310 + }, + { + "epoch": 4.681179015684154, + "grad_norm": 25.625, + "learning_rate": 0.002191569676834204, + "loss": 2.0442, + "mean_token_accuracy": 0.5499672889709473, + "num_tokens": 8849202942.0, + "step": 17311 + }, + { + "epoch": 4.681449432125473, + "grad_norm": 7.0625, + "learning_rate": 0.0021912462744177203, + "loss": 2.7527, + "mean_token_accuracy": 0.4685184359550476, + "num_tokens": 8849727226.0, + "step": 17312 + }, + { + "epoch": 4.681719848566793, + "grad_norm": 2.9375, + "learning_rate": 0.0021909231422797054, + "loss": 2.7874, + "mean_token_accuracy": 0.41908854246139526, + "num_tokens": 8850251414.0, + "step": 17313 + }, + { + "epoch": 4.681990265008112, + "grad_norm": 3.515625, + "learning_rate": 0.002190600280430074, + "loss": 2.7557, + "mean_token_accuracy": 0.44127339124679565, + "num_tokens": 8850714653.0, + "step": 17314 + }, + { + "epoch": 4.682260681449432, + "grad_norm": 2.84375, + "learning_rate": 0.0021902776888787345, + "loss": 2.7648, + "mean_token_accuracy": 0.44942790269851685, + "num_tokens": 8851238908.0, + "step": 17315 + }, + { + "epoch": 4.6825310978907515, + "grad_norm": 3.546875, + "learning_rate": 0.0021899553676355816, + "loss": 2.9554, + "mean_token_accuracy": 0.4034305214881897, + "num_tokens": 8851756174.0, + "step": 17316 + }, + { + "epoch": 4.682801514332072, + "grad_norm": 3.796875, + "learning_rate": 0.0021896333167105066, + "loss": 2.9611, + "mean_token_accuracy": 0.4247022569179535, + "num_tokens": 8852228100.0, + "step": 17317 + }, + { + "epoch": 4.683071930773391, + "grad_norm": 3.59375, + "learning_rate": 0.002189311536113392, + "loss": 2.8319, + "mean_token_accuracy": 0.4320635199546814, + "num_tokens": 8852752102.0, + "step": 17318 + }, + { + "epoch": 4.683342347214711, + "grad_norm": 3.28125, + "learning_rate": 0.0021889900258541084, + "loss": 2.805, + "mean_token_accuracy": 0.4492315948009491, + "num_tokens": 8853276328.0, + "step": 17319 + }, + { + "epoch": 4.68361276365603, + "grad_norm": 10.0625, + "learning_rate": 0.0021886687859425237, + "loss": 2.7206, + "mean_token_accuracy": 0.47519099712371826, + "num_tokens": 8853800500.0, + "step": 17320 + }, + { + "epoch": 4.68388318009735, + "grad_norm": 3.265625, + "learning_rate": 0.002188347816388492, + "loss": 2.7085, + "mean_token_accuracy": 0.4578390419483185, + "num_tokens": 8854279497.0, + "step": 17321 + }, + { + "epoch": 4.684153596538669, + "grad_norm": 4.09375, + "learning_rate": 0.0021880271172018636, + "loss": 2.6885, + "mean_token_accuracy": 0.4419364333152771, + "num_tokens": 8854803690.0, + "step": 17322 + }, + { + "epoch": 4.6844240129799894, + "grad_norm": 3.0, + "learning_rate": 0.0021877066883924778, + "loss": 2.9222, + "mean_token_accuracy": 0.44258999824523926, + "num_tokens": 8855327802.0, + "step": 17323 + }, + { + "epoch": 4.684694429421309, + "grad_norm": 4.1875, + "learning_rate": 0.0021873865299701653, + "loss": 2.7482, + "mean_token_accuracy": 0.4449879825115204, + "num_tokens": 8855852064.0, + "step": 17324 + }, + { + "epoch": 4.684964845862629, + "grad_norm": 12.375, + "learning_rate": 0.002187066641944751, + "loss": 2.7591, + "mean_token_accuracy": 0.4752722680568695, + "num_tokens": 8856376303.0, + "step": 17325 + }, + { + "epoch": 4.685235262303948, + "grad_norm": 8.8125, + "learning_rate": 0.0021867470243260483, + "loss": 2.5139, + "mean_token_accuracy": 0.47295573353767395, + "num_tokens": 8856860412.0, + "step": 17326 + }, + { + "epoch": 4.685505678745268, + "grad_norm": 3.265625, + "learning_rate": 0.002186427677123866, + "loss": 2.7271, + "mean_token_accuracy": 0.47085168957710266, + "num_tokens": 8857261827.0, + "step": 17327 + }, + { + "epoch": 4.685776095186587, + "grad_norm": 3.609375, + "learning_rate": 0.002186108600348001, + "loss": 2.6822, + "mean_token_accuracy": 0.4452310800552368, + "num_tokens": 8857729985.0, + "step": 17328 + }, + { + "epoch": 4.686046511627907, + "grad_norm": 3.0625, + "learning_rate": 0.0021857897940082446, + "loss": 2.9285, + "mean_token_accuracy": 0.42744165658950806, + "num_tokens": 8858254036.0, + "step": 17329 + }, + { + "epoch": 4.6863169280692265, + "grad_norm": 3.484375, + "learning_rate": 0.002185471258114378, + "loss": 2.8024, + "mean_token_accuracy": 0.4453433156013489, + "num_tokens": 8858778264.0, + "step": 17330 + }, + { + "epoch": 4.686587344510547, + "grad_norm": 12.8125, + "learning_rate": 0.002185152992676174, + "loss": 2.0988, + "mean_token_accuracy": 0.5356391668319702, + "num_tokens": 8859302509.0, + "step": 17331 + }, + { + "epoch": 4.686857760951866, + "grad_norm": 7.125, + "learning_rate": 0.002184834997703401, + "loss": 2.6868, + "mean_token_accuracy": 0.44043850898742676, + "num_tokens": 8859826687.0, + "step": 17332 + }, + { + "epoch": 4.687128177393186, + "grad_norm": 3.4375, + "learning_rate": 0.0021845172732058117, + "loss": 2.8218, + "mean_token_accuracy": 0.41227495670318604, + "num_tokens": 8860350870.0, + "step": 17333 + }, + { + "epoch": 4.687398593834505, + "grad_norm": 3.203125, + "learning_rate": 0.0021841998191931594, + "loss": 2.9029, + "mean_token_accuracy": 0.4357646405696869, + "num_tokens": 8860875031.0, + "step": 17334 + }, + { + "epoch": 4.687669010275824, + "grad_norm": 5.46875, + "learning_rate": 0.00218388263567518, + "loss": 2.988, + "mean_token_accuracy": 0.406869113445282, + "num_tokens": 8861399250.0, + "step": 17335 + }, + { + "epoch": 4.687939426717144, + "grad_norm": 3.4375, + "learning_rate": 0.002183565722661609, + "loss": 2.6175, + "mean_token_accuracy": 0.46337422728538513, + "num_tokens": 8861887112.0, + "step": 17336 + }, + { + "epoch": 4.6882098431584645, + "grad_norm": 3.90625, + "learning_rate": 0.002183249080162169, + "loss": 2.6496, + "mean_token_accuracy": 0.4527551233768463, + "num_tokens": 8862411354.0, + "step": 17337 + }, + { + "epoch": 4.688480259599784, + "grad_norm": 3.75, + "learning_rate": 0.0021829327081865756, + "loss": 2.6926, + "mean_token_accuracy": 0.44196179509162903, + "num_tokens": 8862935558.0, + "step": 17338 + }, + { + "epoch": 4.688750676041103, + "grad_norm": 4.75, + "learning_rate": 0.002182616606744537, + "loss": 2.5908, + "mean_token_accuracy": 0.46382254362106323, + "num_tokens": 8863438873.0, + "step": 17339 + }, + { + "epoch": 4.689021092482423, + "grad_norm": 4.3125, + "learning_rate": 0.002182300775845749, + "loss": 2.7436, + "mean_token_accuracy": 0.4577341675758362, + "num_tokens": 8863963021.0, + "step": 17340 + }, + { + "epoch": 4.689291508923743, + "grad_norm": 4.25, + "learning_rate": 0.002181985215499906, + "loss": 2.6894, + "mean_token_accuracy": 0.4416722059249878, + "num_tokens": 8864485105.0, + "step": 17341 + }, + { + "epoch": 4.689561925365062, + "grad_norm": 4.15625, + "learning_rate": 0.002181669925716689, + "loss": 2.9552, + "mean_token_accuracy": 0.44395720958709717, + "num_tokens": 8864980487.0, + "step": 17342 + }, + { + "epoch": 4.689832341806381, + "grad_norm": 4.875, + "learning_rate": 0.0021813549065057697, + "loss": 2.9508, + "mean_token_accuracy": 0.44137442111968994, + "num_tokens": 8865446797.0, + "step": 17343 + }, + { + "epoch": 4.6901027582477015, + "grad_norm": 4.90625, + "learning_rate": 0.0021810401578768175, + "loss": 2.7193, + "mean_token_accuracy": 0.44930434226989746, + "num_tokens": 8865971009.0, + "step": 17344 + }, + { + "epoch": 4.690373174689021, + "grad_norm": 3.375, + "learning_rate": 0.0021807256798394878, + "loss": 2.8391, + "mean_token_accuracy": 0.4137027859687805, + "num_tokens": 8866483102.0, + "step": 17345 + }, + { + "epoch": 4.690643591130341, + "grad_norm": 7.28125, + "learning_rate": 0.0021804114724034303, + "loss": 2.5016, + "mean_token_accuracy": 0.506067156791687, + "num_tokens": 8867007348.0, + "step": 17346 + }, + { + "epoch": 4.69091400757166, + "grad_norm": 2.59375, + "learning_rate": 0.0021800975355782855, + "loss": 2.797, + "mean_token_accuracy": 0.43300431966781616, + "num_tokens": 8867531615.0, + "step": 17347 + }, + { + "epoch": 4.69118442401298, + "grad_norm": 3.78125, + "learning_rate": 0.0021797838693736844, + "loss": 2.7323, + "mean_token_accuracy": 0.45268934965133667, + "num_tokens": 8868055870.0, + "step": 17348 + }, + { + "epoch": 4.691454840454299, + "grad_norm": 3.796875, + "learning_rate": 0.002179470473799254, + "loss": 2.7507, + "mean_token_accuracy": 0.4569080173969269, + "num_tokens": 8868580138.0, + "step": 17349 + }, + { + "epoch": 4.691725256895619, + "grad_norm": 3.5, + "learning_rate": 0.0021791573488646084, + "loss": 2.9378, + "mean_token_accuracy": 0.4308910071849823, + "num_tokens": 8869062668.0, + "step": 17350 + }, + { + "epoch": 4.691995673336939, + "grad_norm": 38.0, + "learning_rate": 0.0021788444945793563, + "loss": 3.0856, + "mean_token_accuracy": 0.43389806151390076, + "num_tokens": 8869524484.0, + "step": 17351 + }, + { + "epoch": 4.692266089778259, + "grad_norm": 9.375, + "learning_rate": 0.0021785319109530964, + "loss": 2.9701, + "mean_token_accuracy": 0.43383854627609253, + "num_tokens": 8870048738.0, + "step": 17352 + }, + { + "epoch": 4.692536506219578, + "grad_norm": 9.25, + "learning_rate": 0.00217821959799542, + "loss": 2.5832, + "mean_token_accuracy": 0.453564316034317, + "num_tokens": 8870572927.0, + "step": 17353 + }, + { + "epoch": 4.692806922660898, + "grad_norm": 4.5, + "learning_rate": 0.0021779075557159084, + "loss": 2.8538, + "mean_token_accuracy": 0.44610172510147095, + "num_tokens": 8871077721.0, + "step": 17354 + }, + { + "epoch": 4.693077339102217, + "grad_norm": 3.3125, + "learning_rate": 0.002177595784124138, + "loss": 2.9099, + "mean_token_accuracy": 0.43449294567108154, + "num_tokens": 8871601994.0, + "step": 17355 + }, + { + "epoch": 4.693347755543537, + "grad_norm": 8.0, + "learning_rate": 0.0021772842832296743, + "loss": 2.1713, + "mean_token_accuracy": 0.490475594997406, + "num_tokens": 8872126215.0, + "step": 17356 + }, + { + "epoch": 4.693618171984856, + "grad_norm": 4.875, + "learning_rate": 0.0021769730530420747, + "loss": 2.5854, + "mean_token_accuracy": 0.4548065662384033, + "num_tokens": 8872650494.0, + "step": 17357 + }, + { + "epoch": 4.6938885884261765, + "grad_norm": 5.8125, + "learning_rate": 0.0021766620935708882, + "loss": 2.4947, + "mean_token_accuracy": 0.4872834384441376, + "num_tokens": 8873174746.0, + "step": 17358 + }, + { + "epoch": 4.694159004867496, + "grad_norm": 3.0625, + "learning_rate": 0.002176351404825656, + "loss": 2.634, + "mean_token_accuracy": 0.45090681314468384, + "num_tokens": 8873698831.0, + "step": 17359 + }, + { + "epoch": 4.694429421308816, + "grad_norm": 3.390625, + "learning_rate": 0.0021760409868159113, + "loss": 2.6621, + "mean_token_accuracy": 0.4494856595993042, + "num_tokens": 8874223035.0, + "step": 17360 + }, + { + "epoch": 4.694699837750135, + "grad_norm": 11.6875, + "learning_rate": 0.0021757308395511797, + "loss": 2.5151, + "mean_token_accuracy": 0.5027036070823669, + "num_tokens": 8874713992.0, + "step": 17361 + }, + { + "epoch": 4.694970254191455, + "grad_norm": 3.796875, + "learning_rate": 0.002175420963040975, + "loss": 3.0072, + "mean_token_accuracy": 0.41431209444999695, + "num_tokens": 8875238194.0, + "step": 17362 + }, + { + "epoch": 4.695240670632774, + "grad_norm": 3.46875, + "learning_rate": 0.002175111357294808, + "loss": 2.7537, + "mean_token_accuracy": 0.4518888592720032, + "num_tokens": 8875762273.0, + "step": 17363 + }, + { + "epoch": 4.695511087074094, + "grad_norm": 4.90625, + "learning_rate": 0.0021748020223221758, + "loss": 2.9627, + "mean_token_accuracy": 0.42814141511917114, + "num_tokens": 8876286551.0, + "step": 17364 + }, + { + "epoch": 4.695781503515414, + "grad_norm": 11.8125, + "learning_rate": 0.002174492958132572, + "loss": 2.5829, + "mean_token_accuracy": 0.47767966985702515, + "num_tokens": 8876810837.0, + "step": 17365 + }, + { + "epoch": 4.696051919956734, + "grad_norm": 5.3125, + "learning_rate": 0.0021741841647354774, + "loss": 2.7391, + "mean_token_accuracy": 0.43538719415664673, + "num_tokens": 8877334980.0, + "step": 17366 + }, + { + "epoch": 4.696322336398053, + "grad_norm": 3.421875, + "learning_rate": 0.002173875642140368, + "loss": 2.7564, + "mean_token_accuracy": 0.4353345036506653, + "num_tokens": 8877859117.0, + "step": 17367 + }, + { + "epoch": 4.696592752839373, + "grad_norm": 6.53125, + "learning_rate": 0.0021735673903567096, + "loss": 2.5041, + "mean_token_accuracy": 0.49259382486343384, + "num_tokens": 8878383381.0, + "step": 17368 + }, + { + "epoch": 4.696863169280692, + "grad_norm": 2.796875, + "learning_rate": 0.0021732594093939613, + "loss": 2.7867, + "mean_token_accuracy": 0.46059250831604004, + "num_tokens": 8878863412.0, + "step": 17369 + }, + { + "epoch": 4.697133585722012, + "grad_norm": 4.90625, + "learning_rate": 0.0021729516992615707, + "loss": 2.5993, + "mean_token_accuracy": 0.44437074661254883, + "num_tokens": 8879387527.0, + "step": 17370 + }, + { + "epoch": 4.697404002163331, + "grad_norm": 20.875, + "learning_rate": 0.0021726442599689817, + "loss": 2.676, + "mean_token_accuracy": 0.4745168685913086, + "num_tokens": 8879911659.0, + "step": 17371 + }, + { + "epoch": 4.6976744186046515, + "grad_norm": 4.9375, + "learning_rate": 0.002172337091525626, + "loss": 2.9627, + "mean_token_accuracy": 0.447612464427948, + "num_tokens": 8880358873.0, + "step": 17372 + }, + { + "epoch": 4.697944835045971, + "grad_norm": 3.578125, + "learning_rate": 0.0021720301939409294, + "loss": 2.7992, + "mean_token_accuracy": 0.43967321515083313, + "num_tokens": 8880882978.0, + "step": 17373 + }, + { + "epoch": 4.698215251487291, + "grad_norm": 3.5625, + "learning_rate": 0.002171723567224306, + "loss": 2.8963, + "mean_token_accuracy": 0.4115353524684906, + "num_tokens": 8881407206.0, + "step": 17374 + }, + { + "epoch": 4.69848566792861, + "grad_norm": 3.46875, + "learning_rate": 0.0021714172113851678, + "loss": 2.7771, + "mean_token_accuracy": 0.42648157477378845, + "num_tokens": 8881883126.0, + "step": 17375 + }, + { + "epoch": 4.698756084369929, + "grad_norm": 3.15625, + "learning_rate": 0.002171111126432911, + "loss": 2.8083, + "mean_token_accuracy": 0.4442700743675232, + "num_tokens": 8882402155.0, + "step": 17376 + }, + { + "epoch": 4.699026500811249, + "grad_norm": 4.4375, + "learning_rate": 0.0021708053123769304, + "loss": 2.6419, + "mean_token_accuracy": 0.45038703083992004, + "num_tokens": 8882891230.0, + "step": 17377 + }, + { + "epoch": 4.699296917252569, + "grad_norm": 3.0625, + "learning_rate": 0.0021704997692266067, + "loss": 2.8122, + "mean_token_accuracy": 0.44716233015060425, + "num_tokens": 8883415502.0, + "step": 17378 + }, + { + "epoch": 4.699567333693889, + "grad_norm": 3.71875, + "learning_rate": 0.0021701944969913173, + "loss": 2.6572, + "mean_token_accuracy": 0.44783544540405273, + "num_tokens": 8883939775.0, + "step": 17379 + }, + { + "epoch": 4.699837750135208, + "grad_norm": 3.5, + "learning_rate": 0.002169889495680426, + "loss": 2.8031, + "mean_token_accuracy": 0.4570823907852173, + "num_tokens": 8884403159.0, + "step": 17380 + }, + { + "epoch": 4.700108166576528, + "grad_norm": 4.25, + "learning_rate": 0.0021695847653032943, + "loss": 2.8922, + "mean_token_accuracy": 0.43683871626853943, + "num_tokens": 8884927328.0, + "step": 17381 + }, + { + "epoch": 4.700378583017848, + "grad_norm": 3.09375, + "learning_rate": 0.002169280305869269, + "loss": 2.7497, + "mean_token_accuracy": 0.4597308933734894, + "num_tokens": 8885451605.0, + "step": 17382 + }, + { + "epoch": 4.700648999459167, + "grad_norm": 3.671875, + "learning_rate": 0.0021689761173876933, + "loss": 2.5364, + "mean_token_accuracy": 0.46470028162002563, + "num_tokens": 8885920993.0, + "step": 17383 + }, + { + "epoch": 4.700919415900486, + "grad_norm": 3.890625, + "learning_rate": 0.0021686721998679017, + "loss": 2.5608, + "mean_token_accuracy": 0.4448862373828888, + "num_tokens": 8886445119.0, + "step": 17384 + }, + { + "epoch": 4.701189832341806, + "grad_norm": 3.015625, + "learning_rate": 0.002168368553319219, + "loss": 2.8957, + "mean_token_accuracy": 0.42257410287857056, + "num_tokens": 8886969395.0, + "step": 17385 + }, + { + "epoch": 4.701460248783126, + "grad_norm": 3.375, + "learning_rate": 0.002168065177750959, + "loss": 2.6868, + "mean_token_accuracy": 0.4353579580783844, + "num_tokens": 8887493669.0, + "step": 17386 + }, + { + "epoch": 4.701730665224446, + "grad_norm": 3.453125, + "learning_rate": 0.0021677620731724336, + "loss": 2.8049, + "mean_token_accuracy": 0.44439998269081116, + "num_tokens": 8888017927.0, + "step": 17387 + }, + { + "epoch": 4.702001081665765, + "grad_norm": 3.609375, + "learning_rate": 0.0021674592395929428, + "loss": 2.8183, + "mean_token_accuracy": 0.44707775115966797, + "num_tokens": 8888518563.0, + "step": 17388 + }, + { + "epoch": 4.702271498107085, + "grad_norm": 4.71875, + "learning_rate": 0.0021671566770217754, + "loss": 2.6487, + "mean_token_accuracy": 0.4385969936847687, + "num_tokens": 8889042704.0, + "step": 17389 + }, + { + "epoch": 4.702541914548404, + "grad_norm": 3.25, + "learning_rate": 0.002166854385468218, + "loss": 2.7565, + "mean_token_accuracy": 0.4431774318218231, + "num_tokens": 8889566985.0, + "step": 17390 + }, + { + "epoch": 4.702812330989724, + "grad_norm": 38.5, + "learning_rate": 0.0021665523649415435, + "loss": 2.2605, + "mean_token_accuracy": 0.47936713695526123, + "num_tokens": 8890091207.0, + "step": 17391 + }, + { + "epoch": 4.7030827474310435, + "grad_norm": 11.25, + "learning_rate": 0.00216625061545102, + "loss": 2.9491, + "mean_token_accuracy": 0.4187261760234833, + "num_tokens": 8890615410.0, + "step": 17392 + }, + { + "epoch": 4.703353163872364, + "grad_norm": 3.375, + "learning_rate": 0.002165949137005907, + "loss": 2.7574, + "mean_token_accuracy": 0.4554133117198944, + "num_tokens": 8891139662.0, + "step": 17393 + }, + { + "epoch": 4.703623580313683, + "grad_norm": 3.625, + "learning_rate": 0.0021656479296154533, + "loss": 2.813, + "mean_token_accuracy": 0.4564899206161499, + "num_tokens": 8891663936.0, + "step": 17394 + }, + { + "epoch": 4.703893996755003, + "grad_norm": 4.34375, + "learning_rate": 0.002165346993288901, + "loss": 2.6283, + "mean_token_accuracy": 0.4698958396911621, + "num_tokens": 8892188143.0, + "step": 17395 + }, + { + "epoch": 4.704164413196322, + "grad_norm": 3.15625, + "learning_rate": 0.002165046328035485, + "loss": 2.8726, + "mean_token_accuracy": 0.4440114498138428, + "num_tokens": 8892712357.0, + "step": 17396 + }, + { + "epoch": 4.704434829637642, + "grad_norm": 3.6875, + "learning_rate": 0.002164745933864428, + "loss": 2.7578, + "mean_token_accuracy": 0.4637675881385803, + "num_tokens": 8893226780.0, + "step": 17397 + }, + { + "epoch": 4.704705246078961, + "grad_norm": 5.21875, + "learning_rate": 0.0021644458107849505, + "loss": 2.8619, + "mean_token_accuracy": 0.4397440552711487, + "num_tokens": 8893751003.0, + "step": 17398 + }, + { + "epoch": 4.704975662520281, + "grad_norm": 5.09375, + "learning_rate": 0.002164145958806258, + "loss": 2.506, + "mean_token_accuracy": 0.502180814743042, + "num_tokens": 8894260492.0, + "step": 17399 + }, + { + "epoch": 4.705246078961601, + "grad_norm": 3.453125, + "learning_rate": 0.0021638463779375516, + "loss": 2.7323, + "mean_token_accuracy": 0.46071937680244446, + "num_tokens": 8894746071.0, + "step": 17400 + }, + { + "epoch": 4.705516495402921, + "grad_norm": 4.25, + "learning_rate": 0.0021635470681880256, + "loss": 2.7932, + "mean_token_accuracy": 0.4569518566131592, + "num_tokens": 8895201343.0, + "step": 17401 + }, + { + "epoch": 4.70578691184424, + "grad_norm": 4.78125, + "learning_rate": 0.00216324802956686, + "loss": 2.6952, + "mean_token_accuracy": 0.4344896674156189, + "num_tokens": 8895725521.0, + "step": 17402 + }, + { + "epoch": 4.70605732828556, + "grad_norm": 4.4375, + "learning_rate": 0.002162949262083232, + "loss": 2.6677, + "mean_token_accuracy": 0.46178296208381653, + "num_tokens": 8896249598.0, + "step": 17403 + }, + { + "epoch": 4.706327744726879, + "grad_norm": 3.828125, + "learning_rate": 0.00216265076574631, + "loss": 2.6688, + "mean_token_accuracy": 0.45889222621917725, + "num_tokens": 8896773552.0, + "step": 17404 + }, + { + "epoch": 4.706598161168199, + "grad_norm": 3.984375, + "learning_rate": 0.00216235254056525, + "loss": 2.6262, + "mean_token_accuracy": 0.4656716287136078, + "num_tokens": 8897297811.0, + "step": 17405 + }, + { + "epoch": 4.7068685776095185, + "grad_norm": 4.75, + "learning_rate": 0.0021620545865492042, + "loss": 2.6503, + "mean_token_accuracy": 0.4542638659477234, + "num_tokens": 8897822082.0, + "step": 17406 + }, + { + "epoch": 4.707138994050839, + "grad_norm": 3.953125, + "learning_rate": 0.0021617569037073155, + "loss": 2.8376, + "mean_token_accuracy": 0.4398157596588135, + "num_tokens": 8898346234.0, + "step": 17407 + }, + { + "epoch": 4.707409410492158, + "grad_norm": 3.40625, + "learning_rate": 0.0021614594920487157, + "loss": 2.7303, + "mean_token_accuracy": 0.4462897777557373, + "num_tokens": 8898839704.0, + "step": 17408 + }, + { + "epoch": 4.707679826933478, + "grad_norm": 3.5625, + "learning_rate": 0.0021611623515825316, + "loss": 2.6819, + "mean_token_accuracy": 0.44559377431869507, + "num_tokens": 8899363881.0, + "step": 17409 + }, + { + "epoch": 4.707950243374797, + "grad_norm": 3.75, + "learning_rate": 0.0021608654823178793, + "loss": 2.9149, + "mean_token_accuracy": 0.41715094447135925, + "num_tokens": 8899888118.0, + "step": 17410 + }, + { + "epoch": 4.708220659816117, + "grad_norm": 10.0, + "learning_rate": 0.0021605688842638695, + "loss": 2.0377, + "mean_token_accuracy": 0.5350087285041809, + "num_tokens": 8900412322.0, + "step": 17411 + }, + { + "epoch": 4.708491076257436, + "grad_norm": 6.4375, + "learning_rate": 0.0021602725574296, + "loss": 2.6002, + "mean_token_accuracy": 0.4720664620399475, + "num_tokens": 8900903002.0, + "step": 17412 + }, + { + "epoch": 4.708761492698756, + "grad_norm": 2.71875, + "learning_rate": 0.002159976501824165, + "loss": 2.7658, + "mean_token_accuracy": 0.44293105602264404, + "num_tokens": 8901427239.0, + "step": 17413 + }, + { + "epoch": 4.709031909140076, + "grad_norm": 32.0, + "learning_rate": 0.0021596807174566472, + "loss": 2.7612, + "mean_token_accuracy": 0.43979331851005554, + "num_tokens": 8901951468.0, + "step": 17414 + }, + { + "epoch": 4.709302325581396, + "grad_norm": 6.875, + "learning_rate": 0.0021593852043361233, + "loss": 2.997, + "mean_token_accuracy": 0.4145873785018921, + "num_tokens": 8902475658.0, + "step": 17415 + }, + { + "epoch": 4.709572742022715, + "grad_norm": 4.0625, + "learning_rate": 0.00215908996247166, + "loss": 2.7749, + "mean_token_accuracy": 0.4482843279838562, + "num_tokens": 8902960577.0, + "step": 17416 + }, + { + "epoch": 4.709843158464034, + "grad_norm": 3.265625, + "learning_rate": 0.002158794991872315, + "loss": 2.6702, + "mean_token_accuracy": 0.45702147483825684, + "num_tokens": 8903484808.0, + "step": 17417 + }, + { + "epoch": 4.710113574905354, + "grad_norm": 3.890625, + "learning_rate": 0.002158500292547141, + "loss": 2.7909, + "mean_token_accuracy": 0.4335222840309143, + "num_tokens": 8904009076.0, + "step": 17418 + }, + { + "epoch": 4.710383991346674, + "grad_norm": 2.953125, + "learning_rate": 0.002158205864505178, + "loss": 2.5652, + "mean_token_accuracy": 0.45843273401260376, + "num_tokens": 8904533130.0, + "step": 17419 + }, + { + "epoch": 4.7106544077879935, + "grad_norm": 4.5, + "learning_rate": 0.0021579117077554625, + "loss": 2.683, + "mean_token_accuracy": 0.4426826238632202, + "num_tokens": 8905057413.0, + "step": 17420 + }, + { + "epoch": 4.710924824229313, + "grad_norm": 3.21875, + "learning_rate": 0.0021576178223070177, + "loss": 2.7703, + "mean_token_accuracy": 0.4370476007461548, + "num_tokens": 8905581576.0, + "step": 17421 + }, + { + "epoch": 4.711195240670633, + "grad_norm": 4.0625, + "learning_rate": 0.0021573242081688623, + "loss": 2.9387, + "mean_token_accuracy": 0.4486062228679657, + "num_tokens": 8906045295.0, + "step": 17422 + }, + { + "epoch": 4.711465657111953, + "grad_norm": 3.484375, + "learning_rate": 0.002157030865350004, + "loss": 2.6414, + "mean_token_accuracy": 0.46797946095466614, + "num_tokens": 8906569483.0, + "step": 17423 + }, + { + "epoch": 4.711736073553272, + "grad_norm": 3.3125, + "learning_rate": 0.0021567377938594458, + "loss": 2.8435, + "mean_token_accuracy": 0.4237357974052429, + "num_tokens": 8907058545.0, + "step": 17424 + }, + { + "epoch": 4.712006489994591, + "grad_norm": 4.0, + "learning_rate": 0.0021564449937061785, + "loss": 2.6485, + "mean_token_accuracy": 0.4503505229949951, + "num_tokens": 8907582707.0, + "step": 17425 + }, + { + "epoch": 4.712276906435911, + "grad_norm": 3.703125, + "learning_rate": 0.0021561524648991835, + "loss": 2.7578, + "mean_token_accuracy": 0.4555127024650574, + "num_tokens": 8908106962.0, + "step": 17426 + }, + { + "epoch": 4.7125473228772305, + "grad_norm": 3.953125, + "learning_rate": 0.002155860207447441, + "loss": 2.7735, + "mean_token_accuracy": 0.45219385623931885, + "num_tokens": 8908615557.0, + "step": 17427 + }, + { + "epoch": 4.712817739318551, + "grad_norm": 3.296875, + "learning_rate": 0.002155568221359917, + "loss": 2.6357, + "mean_token_accuracy": 0.4689588248729706, + "num_tokens": 8909106711.0, + "step": 17428 + }, + { + "epoch": 4.71308815575987, + "grad_norm": 3.0, + "learning_rate": 0.0021552765066455678, + "loss": 2.7987, + "mean_token_accuracy": 0.4350096583366394, + "num_tokens": 8909593396.0, + "step": 17429 + }, + { + "epoch": 4.71335857220119, + "grad_norm": 4.15625, + "learning_rate": 0.002154985063313347, + "loss": 2.6609, + "mean_token_accuracy": 0.44187673926353455, + "num_tokens": 8910117499.0, + "step": 17430 + }, + { + "epoch": 4.713628988642509, + "grad_norm": 16.375, + "learning_rate": 0.0021546938913721957, + "loss": 1.9591, + "mean_token_accuracy": 0.5749958753585815, + "num_tokens": 8910641757.0, + "step": 17431 + }, + { + "epoch": 4.713899405083829, + "grad_norm": 6.0, + "learning_rate": 0.0021544029908310484, + "loss": 2.7254, + "mean_token_accuracy": 0.44064557552337646, + "num_tokens": 8911158481.0, + "step": 17432 + }, + { + "epoch": 4.714169821525148, + "grad_norm": 2.765625, + "learning_rate": 0.00215411236169883, + "loss": 2.848, + "mean_token_accuracy": 0.43800032138824463, + "num_tokens": 8911644889.0, + "step": 17433 + }, + { + "epoch": 4.7144402379664685, + "grad_norm": 3.28125, + "learning_rate": 0.002153822003984459, + "loss": 2.7493, + "mean_token_accuracy": 0.42752599716186523, + "num_tokens": 8912169073.0, + "step": 17434 + }, + { + "epoch": 4.714710654407788, + "grad_norm": 3.96875, + "learning_rate": 0.002153531917696843, + "loss": 2.7028, + "mean_token_accuracy": 0.4407726526260376, + "num_tokens": 8912691007.0, + "step": 17435 + }, + { + "epoch": 4.714981070849108, + "grad_norm": 3.359375, + "learning_rate": 0.002153242102844884, + "loss": 2.8444, + "mean_token_accuracy": 0.41489508748054504, + "num_tokens": 8913215233.0, + "step": 17436 + }, + { + "epoch": 4.715251487290427, + "grad_norm": 3.875, + "learning_rate": 0.0021529525594374747, + "loss": 2.7295, + "mean_token_accuracy": 0.4386743903160095, + "num_tokens": 8913722404.0, + "step": 17437 + }, + { + "epoch": 4.715521903731747, + "grad_norm": 3.796875, + "learning_rate": 0.0021526632874834975, + "loss": 2.7651, + "mean_token_accuracy": 0.44901716709136963, + "num_tokens": 8914246520.0, + "step": 17438 + }, + { + "epoch": 4.715792320173066, + "grad_norm": 3.875, + "learning_rate": 0.0021523742869918286, + "loss": 2.8904, + "mean_token_accuracy": 0.4474770426750183, + "num_tokens": 8914707923.0, + "step": 17439 + }, + { + "epoch": 4.716062736614386, + "grad_norm": 4.53125, + "learning_rate": 0.0021520855579713364, + "loss": 2.8664, + "mean_token_accuracy": 0.43848496675491333, + "num_tokens": 8915232201.0, + "step": 17440 + }, + { + "epoch": 4.7163331530557056, + "grad_norm": 4.71875, + "learning_rate": 0.0021517971004308793, + "loss": 2.6629, + "mean_token_accuracy": 0.4526621699333191, + "num_tokens": 8915756363.0, + "step": 17441 + }, + { + "epoch": 4.716603569497026, + "grad_norm": 3.984375, + "learning_rate": 0.0021515089143793083, + "loss": 2.6436, + "mean_token_accuracy": 0.4537661671638489, + "num_tokens": 8916280526.0, + "step": 17442 + }, + { + "epoch": 4.716873985938345, + "grad_norm": 3.515625, + "learning_rate": 0.002151220999825465, + "loss": 2.9089, + "mean_token_accuracy": 0.4131624102592468, + "num_tokens": 8916804703.0, + "step": 17443 + }, + { + "epoch": 4.717144402379665, + "grad_norm": 3.75, + "learning_rate": 0.002150933356778184, + "loss": 2.7458, + "mean_token_accuracy": 0.4451659917831421, + "num_tokens": 8917328718.0, + "step": 17444 + }, + { + "epoch": 4.717414818820984, + "grad_norm": 4.25, + "learning_rate": 0.0021506459852462913, + "loss": 2.8308, + "mean_token_accuracy": 0.428857684135437, + "num_tokens": 8917852922.0, + "step": 17445 + }, + { + "epoch": 4.717685235262304, + "grad_norm": 3.4375, + "learning_rate": 0.002150358885238604, + "loss": 2.7092, + "mean_token_accuracy": 0.4574975073337555, + "num_tokens": 8918376926.0, + "step": 17446 + }, + { + "epoch": 4.717955651703623, + "grad_norm": 3.6875, + "learning_rate": 0.002150072056763931, + "loss": 2.6482, + "mean_token_accuracy": 0.43084824085235596, + "num_tokens": 8918873822.0, + "step": 17447 + }, + { + "epoch": 4.7182260681449435, + "grad_norm": 3.578125, + "learning_rate": 0.0021497854998310728, + "loss": 2.6906, + "mean_token_accuracy": 0.4588145613670349, + "num_tokens": 8919376999.0, + "step": 17448 + }, + { + "epoch": 4.718496484586263, + "grad_norm": 3.515625, + "learning_rate": 0.0021494992144488224, + "loss": 2.8118, + "mean_token_accuracy": 0.43806907534599304, + "num_tokens": 8919901274.0, + "step": 17449 + }, + { + "epoch": 4.718766901027583, + "grad_norm": 3.84375, + "learning_rate": 0.0021492132006259647, + "loss": 2.6849, + "mean_token_accuracy": 0.45479875802993774, + "num_tokens": 8920425404.0, + "step": 17450 + }, + { + "epoch": 4.719037317468902, + "grad_norm": 44.5, + "learning_rate": 0.0021489274583712742, + "loss": 2.795, + "mean_token_accuracy": 0.43095868825912476, + "num_tokens": 8920949685.0, + "step": 17451 + }, + { + "epoch": 4.719307733910222, + "grad_norm": 8.1875, + "learning_rate": 0.0021486419876935174, + "loss": 2.6257, + "mean_token_accuracy": 0.44864219427108765, + "num_tokens": 8921473770.0, + "step": 17452 + }, + { + "epoch": 4.719578150351541, + "grad_norm": 2.890625, + "learning_rate": 0.0021483567886014555, + "loss": 2.8707, + "mean_token_accuracy": 0.41896432638168335, + "num_tokens": 8921998041.0, + "step": 17453 + }, + { + "epoch": 4.719848566792861, + "grad_norm": 4.59375, + "learning_rate": 0.002148071861103839, + "loss": 2.6915, + "mean_token_accuracy": 0.45221537351608276, + "num_tokens": 8922522272.0, + "step": 17454 + }, + { + "epoch": 4.7201189832341806, + "grad_norm": 3.90625, + "learning_rate": 0.0021477872052094085, + "loss": 2.7636, + "mean_token_accuracy": 0.4260830283164978, + "num_tokens": 8923046333.0, + "step": 17455 + }, + { + "epoch": 4.720389399675501, + "grad_norm": 4.40625, + "learning_rate": 0.0021475028209268993, + "loss": 2.596, + "mean_token_accuracy": 0.4511013329029083, + "num_tokens": 8923570472.0, + "step": 17456 + }, + { + "epoch": 4.72065981611682, + "grad_norm": 3.203125, + "learning_rate": 0.0021472187082650375, + "loss": 2.5139, + "mean_token_accuracy": 0.46046173572540283, + "num_tokens": 8924094747.0, + "step": 17457 + }, + { + "epoch": 4.720930232558139, + "grad_norm": 4.375, + "learning_rate": 0.0021469348672325393, + "loss": 2.7571, + "mean_token_accuracy": 0.44642043113708496, + "num_tokens": 8924618986.0, + "step": 17458 + }, + { + "epoch": 4.721200648999459, + "grad_norm": 3.484375, + "learning_rate": 0.0021466512978381156, + "loss": 2.8132, + "mean_token_accuracy": 0.4489559233188629, + "num_tokens": 8925084710.0, + "step": 17459 + }, + { + "epoch": 4.721471065440779, + "grad_norm": 3.984375, + "learning_rate": 0.002146368000090466, + "loss": 2.8912, + "mean_token_accuracy": 0.44900715351104736, + "num_tokens": 8925606664.0, + "step": 17460 + }, + { + "epoch": 4.721741481882098, + "grad_norm": 4.40625, + "learning_rate": 0.002146084973998282, + "loss": 2.8385, + "mean_token_accuracy": 0.43040040135383606, + "num_tokens": 8926130905.0, + "step": 17461 + }, + { + "epoch": 4.722011898323418, + "grad_norm": 3.578125, + "learning_rate": 0.0021458022195702487, + "loss": 2.565, + "mean_token_accuracy": 0.48166242241859436, + "num_tokens": 8926592161.0, + "step": 17462 + }, + { + "epoch": 4.722282314764738, + "grad_norm": 4.0, + "learning_rate": 0.0021455197368150424, + "loss": 2.9102, + "mean_token_accuracy": 0.4374493956565857, + "num_tokens": 8927115167.0, + "step": 17463 + }, + { + "epoch": 4.722552731206058, + "grad_norm": 4.84375, + "learning_rate": 0.00214523752574133, + "loss": 2.8274, + "mean_token_accuracy": 0.4366460144519806, + "num_tokens": 8927639237.0, + "step": 17464 + }, + { + "epoch": 4.722823147647377, + "grad_norm": 4.59375, + "learning_rate": 0.002144955586357771, + "loss": 2.7993, + "mean_token_accuracy": 0.4299519956111908, + "num_tokens": 8928163497.0, + "step": 17465 + }, + { + "epoch": 4.723093564088696, + "grad_norm": 3.734375, + "learning_rate": 0.0021446739186730137, + "loss": 2.8992, + "mean_token_accuracy": 0.43697690963745117, + "num_tokens": 8928687703.0, + "step": 17466 + }, + { + "epoch": 4.723363980530016, + "grad_norm": 4.375, + "learning_rate": 0.002144392522695704, + "loss": 2.8076, + "mean_token_accuracy": 0.4517177641391754, + "num_tokens": 8929211895.0, + "step": 17467 + }, + { + "epoch": 4.7236343969713355, + "grad_norm": 4.53125, + "learning_rate": 0.0021441113984344744, + "loss": 2.7033, + "mean_token_accuracy": 0.4660273790359497, + "num_tokens": 8929736181.0, + "step": 17468 + }, + { + "epoch": 4.723904813412656, + "grad_norm": 4.21875, + "learning_rate": 0.0021438305458979503, + "loss": 2.7725, + "mean_token_accuracy": 0.453948438167572, + "num_tokens": 8930260448.0, + "step": 17469 + }, + { + "epoch": 4.724175229853975, + "grad_norm": 5.34375, + "learning_rate": 0.0021435499650947496, + "loss": 2.6993, + "mean_token_accuracy": 0.43736445903778076, + "num_tokens": 8930784626.0, + "step": 17470 + }, + { + "epoch": 4.724445646295295, + "grad_norm": 39.75, + "learning_rate": 0.0021432696560334803, + "loss": 3.0652, + "mean_token_accuracy": 0.3841829001903534, + "num_tokens": 8931308717.0, + "step": 17471 + }, + { + "epoch": 4.724716062736614, + "grad_norm": 6.34375, + "learning_rate": 0.0021429896187227437, + "loss": 2.93, + "mean_token_accuracy": 0.442754328250885, + "num_tokens": 8931759380.0, + "step": 17472 + }, + { + "epoch": 4.724986479177934, + "grad_norm": 3.390625, + "learning_rate": 0.0021427098531711336, + "loss": 2.6738, + "mean_token_accuracy": 0.45168131589889526, + "num_tokens": 8932237760.0, + "step": 17473 + }, + { + "epoch": 4.725256895619253, + "grad_norm": 4.0625, + "learning_rate": 0.002142430359387233, + "loss": 2.8099, + "mean_token_accuracy": 0.4558677077293396, + "num_tokens": 8932714439.0, + "step": 17474 + }, + { + "epoch": 4.725527312060573, + "grad_norm": 3.359375, + "learning_rate": 0.0021421511373796156, + "loss": 2.9561, + "mean_token_accuracy": 0.4391973316669464, + "num_tokens": 8933238707.0, + "step": 17475 + }, + { + "epoch": 4.725797728501893, + "grad_norm": 3.875, + "learning_rate": 0.0021418721871568507, + "loss": 2.6796, + "mean_token_accuracy": 0.447487473487854, + "num_tokens": 8933762886.0, + "step": 17476 + }, + { + "epoch": 4.726068144943213, + "grad_norm": 2.640625, + "learning_rate": 0.002141593508727499, + "loss": 2.7764, + "mean_token_accuracy": 0.4602404832839966, + "num_tokens": 8934207310.0, + "step": 17477 + }, + { + "epoch": 4.726338561384532, + "grad_norm": 3.609375, + "learning_rate": 0.002141315102100107, + "loss": 2.8195, + "mean_token_accuracy": 0.44358518719673157, + "num_tokens": 8934713802.0, + "step": 17478 + }, + { + "epoch": 4.726608977825852, + "grad_norm": 3.8125, + "learning_rate": 0.002141036967283221, + "loss": 2.8235, + "mean_token_accuracy": 0.4297258257865906, + "num_tokens": 8935234180.0, + "step": 17479 + }, + { + "epoch": 4.726879394267171, + "grad_norm": 3.0625, + "learning_rate": 0.0021407591042853736, + "loss": 2.6381, + "mean_token_accuracy": 0.46454542875289917, + "num_tokens": 8935758465.0, + "step": 17480 + }, + { + "epoch": 4.727149810708491, + "grad_norm": 4.4375, + "learning_rate": 0.0021404815131150895, + "loss": 2.7996, + "mean_token_accuracy": 0.4497705101966858, + "num_tokens": 8936282676.0, + "step": 17481 + }, + { + "epoch": 4.7274202271498105, + "grad_norm": 4.5, + "learning_rate": 0.002140204193780887, + "loss": 2.7287, + "mean_token_accuracy": 0.43227696418762207, + "num_tokens": 8936806808.0, + "step": 17482 + }, + { + "epoch": 4.727690643591131, + "grad_norm": 2.90625, + "learning_rate": 0.002139927146291274, + "loss": 2.857, + "mean_token_accuracy": 0.4482569098472595, + "num_tokens": 8937330984.0, + "step": 17483 + }, + { + "epoch": 4.72796106003245, + "grad_norm": 4.125, + "learning_rate": 0.002139650370654753, + "loss": 2.8441, + "mean_token_accuracy": 0.4407593011856079, + "num_tokens": 8937801441.0, + "step": 17484 + }, + { + "epoch": 4.72823147647377, + "grad_norm": 4.09375, + "learning_rate": 0.0021393738668798153, + "loss": 2.8565, + "mean_token_accuracy": 0.4048198163509369, + "num_tokens": 8938325504.0, + "step": 17485 + }, + { + "epoch": 4.728501892915089, + "grad_norm": 4.09375, + "learning_rate": 0.0021390976349749432, + "loss": 2.6586, + "mean_token_accuracy": 0.4617912173271179, + "num_tokens": 8938822594.0, + "step": 17486 + }, + { + "epoch": 4.728772309356409, + "grad_norm": 4.1875, + "learning_rate": 0.002138821674948615, + "loss": 2.7269, + "mean_token_accuracy": 0.4618413746356964, + "num_tokens": 8939335519.0, + "step": 17487 + }, + { + "epoch": 4.729042725797728, + "grad_norm": 4.03125, + "learning_rate": 0.002138545986809298, + "loss": 2.75, + "mean_token_accuracy": 0.4616309106349945, + "num_tokens": 8939799436.0, + "step": 17488 + }, + { + "epoch": 4.729313142239048, + "grad_norm": 4.0, + "learning_rate": 0.0021382705705654476, + "loss": 2.8489, + "mean_token_accuracy": 0.4334900677204132, + "num_tokens": 8940323604.0, + "step": 17489 + }, + { + "epoch": 4.729583558680368, + "grad_norm": 4.75, + "learning_rate": 0.0021379954262255177, + "loss": 2.8055, + "mean_token_accuracy": 0.4449250102043152, + "num_tokens": 8940847757.0, + "step": 17490 + }, + { + "epoch": 4.729853975121688, + "grad_norm": 90.0, + "learning_rate": 0.00213772055379795, + "loss": 5.7307, + "mean_token_accuracy": 0.20341181755065918, + "num_tokens": 8941372004.0, + "step": 17491 + }, + { + "epoch": 4.730124391563007, + "grad_norm": 7.125, + "learning_rate": 0.0021374459532911773, + "loss": 2.6975, + "mean_token_accuracy": 0.45602652430534363, + "num_tokens": 8941855091.0, + "step": 17492 + }, + { + "epoch": 4.730394808004327, + "grad_norm": 3.0, + "learning_rate": 0.002137171624713626, + "loss": 2.6326, + "mean_token_accuracy": 0.43889960646629333, + "num_tokens": 8942379365.0, + "step": 17493 + }, + { + "epoch": 4.730665224445646, + "grad_norm": 3.984375, + "learning_rate": 0.0021368975680737144, + "loss": 2.7543, + "mean_token_accuracy": 0.444383829832077, + "num_tokens": 8942903514.0, + "step": 17494 + }, + { + "epoch": 4.730935640886966, + "grad_norm": 2.90625, + "learning_rate": 0.0021366237833798482, + "loss": 2.7717, + "mean_token_accuracy": 0.4304826855659485, + "num_tokens": 8943427720.0, + "step": 17495 + }, + { + "epoch": 4.7312060573282855, + "grad_norm": 3.34375, + "learning_rate": 0.002136350270640431, + "loss": 2.6699, + "mean_token_accuracy": 0.4365992248058319, + "num_tokens": 8943951902.0, + "step": 17496 + }, + { + "epoch": 4.731476473769606, + "grad_norm": 3.28125, + "learning_rate": 0.0021360770298638533, + "loss": 2.8342, + "mean_token_accuracy": 0.4263167083263397, + "num_tokens": 8944476161.0, + "step": 17497 + }, + { + "epoch": 4.731746890210925, + "grad_norm": 3.875, + "learning_rate": 0.0021358040610585, + "loss": 2.8559, + "mean_token_accuracy": 0.4195827841758728, + "num_tokens": 8945000423.0, + "step": 17498 + }, + { + "epoch": 4.732017306652244, + "grad_norm": 3.8125, + "learning_rate": 0.0021355313642327455, + "loss": 2.8665, + "mean_token_accuracy": 0.4397290050983429, + "num_tokens": 8945496127.0, + "step": 17499 + }, + { + "epoch": 4.732287723093564, + "grad_norm": 4.28125, + "learning_rate": 0.002135258939394958, + "loss": 2.6493, + "mean_token_accuracy": 0.44257140159606934, + "num_tokens": 8945979639.0, + "step": 17500 + }, + { + "epoch": 4.732558139534884, + "grad_norm": 3.890625, + "learning_rate": 0.0021349867865534944, + "loss": 2.6955, + "mean_token_accuracy": 0.4448488652706146, + "num_tokens": 8946498053.0, + "step": 17501 + }, + { + "epoch": 4.732828555976203, + "grad_norm": 3.46875, + "learning_rate": 0.002134714905716708, + "loss": 2.6229, + "mean_token_accuracy": 0.5093902349472046, + "num_tokens": 8946957170.0, + "step": 17502 + }, + { + "epoch": 4.7330989724175225, + "grad_norm": 4.6875, + "learning_rate": 0.0021344432968929384, + "loss": 2.8246, + "mean_token_accuracy": 0.4493178725242615, + "num_tokens": 8947481452.0, + "step": 17503 + }, + { + "epoch": 4.733369388858843, + "grad_norm": 3.375, + "learning_rate": 0.002134171960090521, + "loss": 2.7493, + "mean_token_accuracy": 0.4517280161380768, + "num_tokens": 8948005732.0, + "step": 17504 + }, + { + "epoch": 4.733639805300163, + "grad_norm": 3.90625, + "learning_rate": 0.0021339008953177798, + "loss": 2.7783, + "mean_token_accuracy": 0.44669032096862793, + "num_tokens": 8948529823.0, + "step": 17505 + }, + { + "epoch": 4.733910221741482, + "grad_norm": 4.125, + "learning_rate": 0.0021336301025830333, + "loss": 2.7414, + "mean_token_accuracy": 0.4517454504966736, + "num_tokens": 8949053923.0, + "step": 17506 + }, + { + "epoch": 4.734180638182801, + "grad_norm": 3.546875, + "learning_rate": 0.002133359581894589, + "loss": 2.8838, + "mean_token_accuracy": 0.45169132947921753, + "num_tokens": 8949565579.0, + "step": 17507 + }, + { + "epoch": 4.734451054624121, + "grad_norm": 3.515625, + "learning_rate": 0.002133089333260749, + "loss": 2.5964, + "mean_token_accuracy": 0.45288941264152527, + "num_tokens": 8950089717.0, + "step": 17508 + }, + { + "epoch": 4.734721471065441, + "grad_norm": 3.5, + "learning_rate": 0.002132819356689803, + "loss": 2.7699, + "mean_token_accuracy": 0.4502689838409424, + "num_tokens": 8950581853.0, + "step": 17509 + }, + { + "epoch": 4.7349918875067605, + "grad_norm": 4.0, + "learning_rate": 0.0021325496521900364, + "loss": 2.6491, + "mean_token_accuracy": 0.4398154020309448, + "num_tokens": 8951106127.0, + "step": 17510 + }, + { + "epoch": 4.73526230394808, + "grad_norm": 19.25, + "learning_rate": 0.002132280219769724, + "loss": 2.3866, + "mean_token_accuracy": 0.48749521374702454, + "num_tokens": 8951624254.0, + "step": 17511 + }, + { + "epoch": 4.7355327203894, + "grad_norm": 7.03125, + "learning_rate": 0.0021320110594371315, + "loss": 2.8173, + "mean_token_accuracy": 0.4430212676525116, + "num_tokens": 8952148524.0, + "step": 17512 + }, + { + "epoch": 4.735803136830719, + "grad_norm": 3.0625, + "learning_rate": 0.00213174217120052, + "loss": 2.8252, + "mean_token_accuracy": 0.43969613313674927, + "num_tokens": 8952672515.0, + "step": 17513 + }, + { + "epoch": 4.736073553272039, + "grad_norm": 4.1875, + "learning_rate": 0.002131473555068139, + "loss": 2.5909, + "mean_token_accuracy": 0.4620107412338257, + "num_tokens": 8953196700.0, + "step": 17514 + }, + { + "epoch": 4.736343969713358, + "grad_norm": 3.765625, + "learning_rate": 0.002131205211048228, + "loss": 2.8534, + "mean_token_accuracy": 0.445334255695343, + "num_tokens": 8953670943.0, + "step": 17515 + }, + { + "epoch": 4.736614386154678, + "grad_norm": 3.8125, + "learning_rate": 0.0021309371391490246, + "loss": 2.6475, + "mean_token_accuracy": 0.44762200117111206, + "num_tokens": 8954195068.0, + "step": 17516 + }, + { + "epoch": 4.7368848025959975, + "grad_norm": 3.8125, + "learning_rate": 0.002130669339378751, + "loss": 2.8071, + "mean_token_accuracy": 0.4478350281715393, + "num_tokens": 8954719199.0, + "step": 17517 + }, + { + "epoch": 4.737155219037318, + "grad_norm": 3.984375, + "learning_rate": 0.002130401811745625, + "loss": 2.7625, + "mean_token_accuracy": 0.44407808780670166, + "num_tokens": 8955241769.0, + "step": 17518 + }, + { + "epoch": 4.737425635478637, + "grad_norm": 4.4375, + "learning_rate": 0.0021301345562578554, + "loss": 2.837, + "mean_token_accuracy": 0.4252643585205078, + "num_tokens": 8955766039.0, + "step": 17519 + }, + { + "epoch": 4.737696051919957, + "grad_norm": 3.921875, + "learning_rate": 0.002129867572923642, + "loss": 2.7294, + "mean_token_accuracy": 0.44444966316223145, + "num_tokens": 8956290324.0, + "step": 17520 + }, + { + "epoch": 4.737966468361276, + "grad_norm": 3.59375, + "learning_rate": 0.002129600861751176, + "loss": 2.7538, + "mean_token_accuracy": 0.45998644828796387, + "num_tokens": 8956770726.0, + "step": 17521 + }, + { + "epoch": 4.738236884802596, + "grad_norm": 4.40625, + "learning_rate": 0.0021293344227486435, + "loss": 2.8393, + "mean_token_accuracy": 0.4512399435043335, + "num_tokens": 8957294997.0, + "step": 17522 + }, + { + "epoch": 4.738507301243915, + "grad_norm": 3.90625, + "learning_rate": 0.0021290682559242176, + "loss": 2.6778, + "mean_token_accuracy": 0.4527278542518616, + "num_tokens": 8957819275.0, + "step": 17523 + }, + { + "epoch": 4.7387777176852355, + "grad_norm": 4.15625, + "learning_rate": 0.0021288023612860644, + "loss": 2.56, + "mean_token_accuracy": 0.4358789026737213, + "num_tokens": 8958343471.0, + "step": 17524 + }, + { + "epoch": 4.739048134126555, + "grad_norm": 3.453125, + "learning_rate": 0.002128536738842344, + "loss": 2.8098, + "mean_token_accuracy": 0.4460698366165161, + "num_tokens": 8958867747.0, + "step": 17525 + }, + { + "epoch": 4.739318550567875, + "grad_norm": 4.21875, + "learning_rate": 0.002128271388601205, + "loss": 2.7006, + "mean_token_accuracy": 0.453091561794281, + "num_tokens": 8959349916.0, + "step": 17526 + }, + { + "epoch": 4.739588967009194, + "grad_norm": 3.171875, + "learning_rate": 0.0021280063105707904, + "loss": 2.7249, + "mean_token_accuracy": 0.44252777099609375, + "num_tokens": 8959874142.0, + "step": 17527 + }, + { + "epoch": 4.739859383450514, + "grad_norm": 4.28125, + "learning_rate": 0.002127741504759233, + "loss": 2.759, + "mean_token_accuracy": 0.4218951165676117, + "num_tokens": 8960398372.0, + "step": 17528 + }, + { + "epoch": 4.740129799891833, + "grad_norm": 2.9375, + "learning_rate": 0.0021274769711746573, + "loss": 2.8582, + "mean_token_accuracy": 0.41744375228881836, + "num_tokens": 8960922578.0, + "step": 17529 + }, + { + "epoch": 4.740400216333153, + "grad_norm": 3.671875, + "learning_rate": 0.0021272127098251816, + "loss": 2.6036, + "mean_token_accuracy": 0.4510155916213989, + "num_tokens": 8961355539.0, + "step": 17530 + }, + { + "epoch": 4.7406706327744725, + "grad_norm": 57.25, + "learning_rate": 0.0021269487207189125, + "loss": 2.8387, + "mean_token_accuracy": 0.4531252086162567, + "num_tokens": 8961826700.0, + "step": 17531 + }, + { + "epoch": 4.740941049215793, + "grad_norm": 7.53125, + "learning_rate": 0.00212668500386395, + "loss": 2.8028, + "mean_token_accuracy": 0.42392370104789734, + "num_tokens": 8962350977.0, + "step": 17532 + }, + { + "epoch": 4.741211465657112, + "grad_norm": 3.078125, + "learning_rate": 0.0021264215592683862, + "loss": 2.823, + "mean_token_accuracy": 0.4630015790462494, + "num_tokens": 8962816434.0, + "step": 17533 + }, + { + "epoch": 4.741481882098432, + "grad_norm": 3.515625, + "learning_rate": 0.0021261583869403057, + "loss": 2.8637, + "mean_token_accuracy": 0.4410434365272522, + "num_tokens": 8963327364.0, + "step": 17534 + }, + { + "epoch": 4.741752298539751, + "grad_norm": 3.75, + "learning_rate": 0.0021258954868877806, + "loss": 2.7185, + "mean_token_accuracy": 0.471446692943573, + "num_tokens": 8963792398.0, + "step": 17535 + }, + { + "epoch": 4.742022714981071, + "grad_norm": 3.796875, + "learning_rate": 0.0021256328591188808, + "loss": 2.9472, + "mean_token_accuracy": 0.4386156499385834, + "num_tokens": 8964316579.0, + "step": 17536 + }, + { + "epoch": 4.74229313142239, + "grad_norm": 3.4375, + "learning_rate": 0.0021253705036416615, + "loss": 2.5527, + "mean_token_accuracy": 0.4646742343902588, + "num_tokens": 8964840771.0, + "step": 17537 + }, + { + "epoch": 4.7425635478637105, + "grad_norm": 3.78125, + "learning_rate": 0.002125108420464173, + "loss": 2.6865, + "mean_token_accuracy": 0.45295238494873047, + "num_tokens": 8965364949.0, + "step": 17538 + }, + { + "epoch": 4.74283396430503, + "grad_norm": 4.28125, + "learning_rate": 0.0021248466095944592, + "loss": 2.8084, + "mean_token_accuracy": 0.4564082622528076, + "num_tokens": 8965889209.0, + "step": 17539 + }, + { + "epoch": 4.743104380746349, + "grad_norm": 4.15625, + "learning_rate": 0.0021245850710405515, + "loss": 2.6965, + "mean_token_accuracy": 0.4358089566230774, + "num_tokens": 8966413418.0, + "step": 17540 + }, + { + "epoch": 4.743374797187669, + "grad_norm": 3.3125, + "learning_rate": 0.002124323804810474, + "loss": 2.6602, + "mean_token_accuracy": 0.44832974672317505, + "num_tokens": 8966937620.0, + "step": 17541 + }, + { + "epoch": 4.743645213628989, + "grad_norm": 4.0625, + "learning_rate": 0.0021240628109122435, + "loss": 2.8051, + "mean_token_accuracy": 0.46199026703834534, + "num_tokens": 8967395863.0, + "step": 17542 + }, + { + "epoch": 4.743915630070308, + "grad_norm": 3.953125, + "learning_rate": 0.00212380208935387, + "loss": 2.7526, + "mean_token_accuracy": 0.45000240206718445, + "num_tokens": 8967920138.0, + "step": 17543 + }, + { + "epoch": 4.7441860465116275, + "grad_norm": 4.15625, + "learning_rate": 0.0021235416401433494, + "loss": 2.6643, + "mean_token_accuracy": 0.4566996693611145, + "num_tokens": 8968444235.0, + "step": 17544 + }, + { + "epoch": 4.7444564629529475, + "grad_norm": 4.78125, + "learning_rate": 0.0021232814632886764, + "loss": 2.7775, + "mean_token_accuracy": 0.4250306189060211, + "num_tokens": 8968968470.0, + "step": 17545 + }, + { + "epoch": 4.744726879394268, + "grad_norm": 3.5, + "learning_rate": 0.0021230215587978323, + "loss": 2.9659, + "mean_token_accuracy": 0.427399218082428, + "num_tokens": 8969492656.0, + "step": 17546 + }, + { + "epoch": 4.744997295835587, + "grad_norm": 3.640625, + "learning_rate": 0.0021227619266787935, + "loss": 2.7887, + "mean_token_accuracy": 0.4336794912815094, + "num_tokens": 8970016928.0, + "step": 17547 + }, + { + "epoch": 4.745267712276906, + "grad_norm": 4.0625, + "learning_rate": 0.002122502566939523, + "loss": 2.7207, + "mean_token_accuracy": 0.4358571171760559, + "num_tokens": 8970541129.0, + "step": 17548 + }, + { + "epoch": 4.745538128718226, + "grad_norm": 3.578125, + "learning_rate": 0.0021222434795879833, + "loss": 2.6297, + "mean_token_accuracy": 0.47029930353164673, + "num_tokens": 8970999745.0, + "step": 17549 + }, + { + "epoch": 4.745808545159546, + "grad_norm": 4.25, + "learning_rate": 0.0021219846646321197, + "loss": 2.82, + "mean_token_accuracy": 0.4473223090171814, + "num_tokens": 8971473629.0, + "step": 17550 + }, + { + "epoch": 4.746078961600865, + "grad_norm": 19.25, + "learning_rate": 0.0021217261220798753, + "loss": 2.393, + "mean_token_accuracy": 0.4980822801589966, + "num_tokens": 8971997910.0, + "step": 17551 + }, + { + "epoch": 4.746349378042185, + "grad_norm": 7.90625, + "learning_rate": 0.0021214678519391834, + "loss": 2.9135, + "mean_token_accuracy": 0.44166916608810425, + "num_tokens": 8972462689.0, + "step": 17552 + }, + { + "epoch": 4.746619794483505, + "grad_norm": 3.4375, + "learning_rate": 0.0021212098542179674, + "loss": 2.5548, + "mean_token_accuracy": 0.44475415349006653, + "num_tokens": 8972986824.0, + "step": 17553 + }, + { + "epoch": 4.746890210924824, + "grad_norm": 4.53125, + "learning_rate": 0.002120952128924144, + "loss": 2.5337, + "mean_token_accuracy": 0.4734826982021332, + "num_tokens": 8973510948.0, + "step": 17554 + }, + { + "epoch": 4.747160627366144, + "grad_norm": 45.5, + "learning_rate": 0.00212069467606562, + "loss": 3.0519, + "mean_token_accuracy": 0.4301477074623108, + "num_tokens": 8974035168.0, + "step": 17555 + }, + { + "epoch": 4.747431043807463, + "grad_norm": 6.46875, + "learning_rate": 0.0021204374956502975, + "loss": 2.8878, + "mean_token_accuracy": 0.44119852781295776, + "num_tokens": 8974559397.0, + "step": 17556 + }, + { + "epoch": 4.747701460248783, + "grad_norm": 4.78125, + "learning_rate": 0.002120180587686065, + "loss": 2.986, + "mean_token_accuracy": 0.4248051643371582, + "num_tokens": 8975040996.0, + "step": 17557 + }, + { + "epoch": 4.7479718766901025, + "grad_norm": 3.765625, + "learning_rate": 0.0021199239521808052, + "loss": 2.7771, + "mean_token_accuracy": 0.44045642018318176, + "num_tokens": 8975565209.0, + "step": 17558 + }, + { + "epoch": 4.7482422931314225, + "grad_norm": 4.0625, + "learning_rate": 0.002119667589142394, + "loss": 2.9299, + "mean_token_accuracy": 0.40786153078079224, + "num_tokens": 8976089461.0, + "step": 17559 + }, + { + "epoch": 4.748512709572742, + "grad_norm": 9.25, + "learning_rate": 0.002119411498578696, + "loss": 2.5893, + "mean_token_accuracy": 0.48095858097076416, + "num_tokens": 8976613534.0, + "step": 17560 + }, + { + "epoch": 4.748783126014062, + "grad_norm": 4.03125, + "learning_rate": 0.00211915568049757, + "loss": 2.9417, + "mean_token_accuracy": 0.4274877607822418, + "num_tokens": 8977137541.0, + "step": 17561 + }, + { + "epoch": 4.749053542455381, + "grad_norm": 4.0, + "learning_rate": 0.0021189001349068634, + "loss": 2.8747, + "mean_token_accuracy": 0.4501197934150696, + "num_tokens": 8977645643.0, + "step": 17562 + }, + { + "epoch": 4.749323958896701, + "grad_norm": 4.1875, + "learning_rate": 0.002118644861814419, + "loss": 2.5417, + "mean_token_accuracy": 0.474629670381546, + "num_tokens": 8978169926.0, + "step": 17563 + }, + { + "epoch": 4.74959437533802, + "grad_norm": 4.0625, + "learning_rate": 0.0021183898612280676, + "loss": 2.8079, + "mean_token_accuracy": 0.4365267753601074, + "num_tokens": 8978694133.0, + "step": 17564 + }, + { + "epoch": 4.74986479177934, + "grad_norm": 3.984375, + "learning_rate": 0.002118135133155636, + "loss": 2.8941, + "mean_token_accuracy": 0.42798370122909546, + "num_tokens": 8979218415.0, + "step": 17565 + }, + { + "epoch": 4.75013520822066, + "grad_norm": 5.3125, + "learning_rate": 0.0021178806776049382, + "loss": 2.7636, + "mean_token_accuracy": 0.43094733357429504, + "num_tokens": 8979738147.0, + "step": 17566 + }, + { + "epoch": 4.75040562466198, + "grad_norm": 3.1875, + "learning_rate": 0.0021176264945837802, + "loss": 2.6445, + "mean_token_accuracy": 0.4494095742702484, + "num_tokens": 8980221380.0, + "step": 17567 + }, + { + "epoch": 4.750676041103299, + "grad_norm": 3.546875, + "learning_rate": 0.0021173725840999648, + "loss": 2.7872, + "mean_token_accuracy": 0.45323646068573, + "num_tokens": 8980745559.0, + "step": 17568 + }, + { + "epoch": 4.750946457544619, + "grad_norm": 4.0625, + "learning_rate": 0.00211711894616128, + "loss": 2.7992, + "mean_token_accuracy": 0.4382838010787964, + "num_tokens": 8981269633.0, + "step": 17569 + }, + { + "epoch": 4.751216873985938, + "grad_norm": 3.5625, + "learning_rate": 0.002116865580775507, + "loss": 2.7165, + "mean_token_accuracy": 0.45871061086654663, + "num_tokens": 8981785401.0, + "step": 17570 + }, + { + "epoch": 4.751487290427258, + "grad_norm": 41.0, + "learning_rate": 0.0021166124879504235, + "loss": 3.0416, + "mean_token_accuracy": 0.43732792139053345, + "num_tokens": 8982309675.0, + "step": 17571 + }, + { + "epoch": 4.7517577068685775, + "grad_norm": 6.5, + "learning_rate": 0.0021163596676937934, + "loss": 2.586, + "mean_token_accuracy": 0.4681971073150635, + "num_tokens": 8982797044.0, + "step": 17572 + }, + { + "epoch": 4.7520281233098975, + "grad_norm": 4.0625, + "learning_rate": 0.0021161071200133724, + "loss": 2.9428, + "mean_token_accuracy": 0.44976872205734253, + "num_tokens": 8983261782.0, + "step": 17573 + }, + { + "epoch": 4.752298539751217, + "grad_norm": 4.375, + "learning_rate": 0.0021158548449169124, + "loss": 2.8417, + "mean_token_accuracy": 0.45760810375213623, + "num_tokens": 8983725521.0, + "step": 17574 + }, + { + "epoch": 4.752568956192537, + "grad_norm": 3.109375, + "learning_rate": 0.0021156028424121507, + "loss": 2.6983, + "mean_token_accuracy": 0.45179152488708496, + "num_tokens": 8984249738.0, + "step": 17575 + }, + { + "epoch": 4.752839372633856, + "grad_norm": 4.625, + "learning_rate": 0.0021153511125068213, + "loss": 2.6255, + "mean_token_accuracy": 0.44278618693351746, + "num_tokens": 8984773761.0, + "step": 17576 + }, + { + "epoch": 4.753109789075176, + "grad_norm": 4.5, + "learning_rate": 0.0021150996552086482, + "loss": 2.7602, + "mean_token_accuracy": 0.4371234178543091, + "num_tokens": 8985240553.0, + "step": 17577 + }, + { + "epoch": 4.753380205516495, + "grad_norm": 3.890625, + "learning_rate": 0.0021148484705253473, + "loss": 2.8022, + "mean_token_accuracy": 0.45678240060806274, + "num_tokens": 8985764837.0, + "step": 17578 + }, + { + "epoch": 4.753650621957815, + "grad_norm": 4.84375, + "learning_rate": 0.0021145975584646237, + "loss": 2.8211, + "mean_token_accuracy": 0.4417879581451416, + "num_tokens": 8986289082.0, + "step": 17579 + }, + { + "epoch": 4.753921038399135, + "grad_norm": 3.59375, + "learning_rate": 0.0021143469190341773, + "loss": 2.6531, + "mean_token_accuracy": 0.4385613203048706, + "num_tokens": 8986803169.0, + "step": 17580 + }, + { + "epoch": 4.754191454840454, + "grad_norm": 3.578125, + "learning_rate": 0.002114096552241699, + "loss": 2.7063, + "mean_token_accuracy": 0.4496864676475525, + "num_tokens": 8987313152.0, + "step": 17581 + }, + { + "epoch": 4.754461871281774, + "grad_norm": 3.421875, + "learning_rate": 0.0021138464580948706, + "loss": 2.5876, + "mean_token_accuracy": 0.42993611097335815, + "num_tokens": 8987837425.0, + "step": 17582 + }, + { + "epoch": 4.754732287723094, + "grad_norm": 2.75, + "learning_rate": 0.002113596636601364, + "loss": 2.7428, + "mean_token_accuracy": 0.4492235481739044, + "num_tokens": 8988361499.0, + "step": 17583 + }, + { + "epoch": 4.755002704164413, + "grad_norm": 3.328125, + "learning_rate": 0.002113347087768847, + "loss": 2.6785, + "mean_token_accuracy": 0.4519834518432617, + "num_tokens": 8988885770.0, + "step": 17584 + }, + { + "epoch": 4.755273120605732, + "grad_norm": 4.125, + "learning_rate": 0.0021130978116049755, + "loss": 2.6983, + "mean_token_accuracy": 0.4337711036205292, + "num_tokens": 8989410041.0, + "step": 17585 + }, + { + "epoch": 4.7555435370470525, + "grad_norm": 3.5625, + "learning_rate": 0.0021128488081173973, + "loss": 2.5884, + "mean_token_accuracy": 0.4645840525627136, + "num_tokens": 8989917369.0, + "step": 17586 + }, + { + "epoch": 4.7558139534883725, + "grad_norm": 4.46875, + "learning_rate": 0.0021126000773137527, + "loss": 2.5803, + "mean_token_accuracy": 0.4589099884033203, + "num_tokens": 8990441653.0, + "step": 17587 + }, + { + "epoch": 4.756084369929692, + "grad_norm": 4.5625, + "learning_rate": 0.0021123516192016745, + "loss": 2.7782, + "mean_token_accuracy": 0.46883130073547363, + "num_tokens": 8990900397.0, + "step": 17588 + }, + { + "epoch": 4.756354786371011, + "grad_norm": 3.546875, + "learning_rate": 0.0021121034337887846, + "loss": 2.7072, + "mean_token_accuracy": 0.44575637578964233, + "num_tokens": 8991424569.0, + "step": 17589 + }, + { + "epoch": 4.756625202812331, + "grad_norm": 3.6875, + "learning_rate": 0.0021118555210826997, + "loss": 2.8941, + "mean_token_accuracy": 0.4266723394393921, + "num_tokens": 8991948842.0, + "step": 17590 + }, + { + "epoch": 4.756895619253651, + "grad_norm": 50.5, + "learning_rate": 0.0021116078810910245, + "loss": 2.4105, + "mean_token_accuracy": 0.4956669807434082, + "num_tokens": 8992473090.0, + "step": 17591 + }, + { + "epoch": 4.75716603569497, + "grad_norm": 8.5625, + "learning_rate": 0.0021113605138213605, + "loss": 2.8882, + "mean_token_accuracy": 0.43699803948402405, + "num_tokens": 8992997315.0, + "step": 17592 + }, + { + "epoch": 4.7574364521362895, + "grad_norm": 2.59375, + "learning_rate": 0.0021111134192812945, + "loss": 2.5992, + "mean_token_accuracy": 0.466561496257782, + "num_tokens": 8993521511.0, + "step": 17593 + }, + { + "epoch": 4.75770686857761, + "grad_norm": 3.84375, + "learning_rate": 0.0021108665974784095, + "loss": 2.6836, + "mean_token_accuracy": 0.4280199408531189, + "num_tokens": 8994045654.0, + "step": 17594 + }, + { + "epoch": 4.757977285018929, + "grad_norm": 3.40625, + "learning_rate": 0.002110620048420279, + "loss": 2.7473, + "mean_token_accuracy": 0.4452378451824188, + "num_tokens": 8994569857.0, + "step": 17595 + }, + { + "epoch": 4.758247701460249, + "grad_norm": 3.359375, + "learning_rate": 0.0021103737721144666, + "loss": 2.8019, + "mean_token_accuracy": 0.4442571997642517, + "num_tokens": 8995094040.0, + "step": 17596 + }, + { + "epoch": 4.758518117901568, + "grad_norm": 4.71875, + "learning_rate": 0.0021101277685685307, + "loss": 2.6009, + "mean_token_accuracy": 0.45337286591529846, + "num_tokens": 8995570628.0, + "step": 17597 + }, + { + "epoch": 4.758788534342888, + "grad_norm": 3.296875, + "learning_rate": 0.0021098820377900176, + "loss": 2.9008, + "mean_token_accuracy": 0.45043307542800903, + "num_tokens": 8996064849.0, + "step": 17598 + }, + { + "epoch": 4.759058950784207, + "grad_norm": 5.25, + "learning_rate": 0.0021096365797864673, + "loss": 2.7922, + "mean_token_accuracy": 0.454129159450531, + "num_tokens": 8996570546.0, + "step": 17599 + }, + { + "epoch": 4.7593293672255275, + "grad_norm": 4.34375, + "learning_rate": 0.0021093913945654124, + "loss": 2.8678, + "mean_token_accuracy": 0.4426485002040863, + "num_tokens": 8997094653.0, + "step": 17600 + }, + { + "epoch": 4.759599783666847, + "grad_norm": 6.375, + "learning_rate": 0.002109146482134375, + "loss": 2.6178, + "mean_token_accuracy": 0.4445299506187439, + "num_tokens": 8997618861.0, + "step": 17601 + }, + { + "epoch": 4.759870200108167, + "grad_norm": 4.71875, + "learning_rate": 0.0021089018425008692, + "loss": 2.8113, + "mean_token_accuracy": 0.4560354948043823, + "num_tokens": 8998143045.0, + "step": 17602 + }, + { + "epoch": 4.760140616549486, + "grad_norm": 3.0625, + "learning_rate": 0.0021086574756724015, + "loss": 2.6755, + "mean_token_accuracy": 0.45359140634536743, + "num_tokens": 8998635687.0, + "step": 17603 + }, + { + "epoch": 4.760411032990806, + "grad_norm": 4.0625, + "learning_rate": 0.0021084133816564707, + "loss": 2.7701, + "mean_token_accuracy": 0.4364836812019348, + "num_tokens": 8999159794.0, + "step": 17604 + }, + { + "epoch": 4.760681449432125, + "grad_norm": 3.171875, + "learning_rate": 0.0021081695604605656, + "loss": 2.7969, + "mean_token_accuracy": 0.4464258551597595, + "num_tokens": 8999684003.0, + "step": 17605 + }, + { + "epoch": 4.760951865873445, + "grad_norm": 3.421875, + "learning_rate": 0.002107926012092169, + "loss": 2.7655, + "mean_token_accuracy": 0.4270867109298706, + "num_tokens": 9000208275.0, + "step": 17606 + }, + { + "epoch": 4.7612222823147645, + "grad_norm": 3.484375, + "learning_rate": 0.0021076827365587507, + "loss": 2.8564, + "mean_token_accuracy": 0.4366493821144104, + "num_tokens": 9000732543.0, + "step": 17607 + }, + { + "epoch": 4.761492698756085, + "grad_norm": 4.28125, + "learning_rate": 0.0021074397338677773, + "loss": 2.7961, + "mean_token_accuracy": 0.4511539340019226, + "num_tokens": 9001201360.0, + "step": 17608 + }, + { + "epoch": 4.761763115197404, + "grad_norm": 3.875, + "learning_rate": 0.0021071970040267038, + "loss": 2.9824, + "mean_token_accuracy": 0.41582512855529785, + "num_tokens": 9001725584.0, + "step": 17609 + }, + { + "epoch": 4.762033531638724, + "grad_norm": 3.96875, + "learning_rate": 0.0021069545470429785, + "loss": 2.5916, + "mean_token_accuracy": 0.46384397149086, + "num_tokens": 9002249723.0, + "step": 17610 + }, + { + "epoch": 4.762303948080043, + "grad_norm": 45.5, + "learning_rate": 0.0021067123629240404, + "loss": 2.8446, + "mean_token_accuracy": 0.4925338923931122, + "num_tokens": 9002756211.0, + "step": 17611 + }, + { + "epoch": 4.762574364521363, + "grad_norm": 7.71875, + "learning_rate": 0.002106470451677321, + "loss": 2.785, + "mean_token_accuracy": 0.43970000743865967, + "num_tokens": 9003280476.0, + "step": 17612 + }, + { + "epoch": 4.762844780962682, + "grad_norm": 9.75, + "learning_rate": 0.002106228813310241, + "loss": 2.6766, + "mean_token_accuracy": 0.48280981183052063, + "num_tokens": 9003792848.0, + "step": 17613 + }, + { + "epoch": 4.7631151974040025, + "grad_norm": 4.03125, + "learning_rate": 0.0021059874478302166, + "loss": 2.696, + "mean_token_accuracy": 0.45140475034713745, + "num_tokens": 9004264783.0, + "step": 17614 + }, + { + "epoch": 4.763385613845322, + "grad_norm": 4.21875, + "learning_rate": 0.002105746355244653, + "loss": 2.7953, + "mean_token_accuracy": 0.4289190173149109, + "num_tokens": 9004789027.0, + "step": 17615 + }, + { + "epoch": 4.763656030286642, + "grad_norm": 3.859375, + "learning_rate": 0.0021055055355609477, + "loss": 2.9364, + "mean_token_accuracy": 0.43309369683265686, + "num_tokens": 9005313301.0, + "step": 17616 + }, + { + "epoch": 4.763926446727961, + "grad_norm": 3.921875, + "learning_rate": 0.0021052649887864892, + "loss": 2.7422, + "mean_token_accuracy": 0.4469190835952759, + "num_tokens": 9005836392.0, + "step": 17617 + }, + { + "epoch": 4.764196863169281, + "grad_norm": 4.875, + "learning_rate": 0.0021050247149286584, + "loss": 2.7182, + "mean_token_accuracy": 0.4776744246482849, + "num_tokens": 9006279471.0, + "step": 17618 + }, + { + "epoch": 4.7644672796106, + "grad_norm": 4.1875, + "learning_rate": 0.0021047847139948274, + "loss": 2.7981, + "mean_token_accuracy": 0.4382161796092987, + "num_tokens": 9006754337.0, + "step": 17619 + }, + { + "epoch": 4.76473769605192, + "grad_norm": 4.03125, + "learning_rate": 0.0021045449859923614, + "loss": 2.8779, + "mean_token_accuracy": 0.4446769952774048, + "num_tokens": 9007230147.0, + "step": 17620 + }, + { + "epoch": 4.7650081124932395, + "grad_norm": 4.15625, + "learning_rate": 0.002104305530928615, + "loss": 2.7455, + "mean_token_accuracy": 0.44261932373046875, + "num_tokens": 9007696826.0, + "step": 17621 + }, + { + "epoch": 4.765278528934559, + "grad_norm": 3.421875, + "learning_rate": 0.0021040663488109354, + "loss": 2.8224, + "mean_token_accuracy": 0.43678587675094604, + "num_tokens": 9008221017.0, + "step": 17622 + }, + { + "epoch": 4.765548945375879, + "grad_norm": 4.5, + "learning_rate": 0.0021038274396466615, + "loss": 2.9256, + "mean_token_accuracy": 0.426935613155365, + "num_tokens": 9008745170.0, + "step": 17623 + }, + { + "epoch": 4.765819361817199, + "grad_norm": 4.5, + "learning_rate": 0.002103588803443124, + "loss": 2.8216, + "mean_token_accuracy": 0.4678589403629303, + "num_tokens": 9009205219.0, + "step": 17624 + }, + { + "epoch": 4.766089778258518, + "grad_norm": 4.6875, + "learning_rate": 0.0021033504402076433, + "loss": 2.5781, + "mean_token_accuracy": 0.44937485456466675, + "num_tokens": 9009729250.0, + "step": 17625 + }, + { + "epoch": 4.766360194699837, + "grad_norm": 3.09375, + "learning_rate": 0.002103112349947535, + "loss": 2.8314, + "mean_token_accuracy": 0.4530925750732422, + "num_tokens": 9010216412.0, + "step": 17626 + }, + { + "epoch": 4.766630611141157, + "grad_norm": 4.03125, + "learning_rate": 0.0021028745326701036, + "loss": 2.859, + "mean_token_accuracy": 0.43748047947883606, + "num_tokens": 9010689946.0, + "step": 17627 + }, + { + "epoch": 4.7669010275824775, + "grad_norm": 3.5, + "learning_rate": 0.0021026369883826476, + "loss": 2.6683, + "mean_token_accuracy": 0.4295755624771118, + "num_tokens": 9011214027.0, + "step": 17628 + }, + { + "epoch": 4.767171444023797, + "grad_norm": 3.484375, + "learning_rate": 0.002102399717092453, + "loss": 2.8438, + "mean_token_accuracy": 0.44054853916168213, + "num_tokens": 9011738284.0, + "step": 17629 + }, + { + "epoch": 4.767441860465116, + "grad_norm": 4.5, + "learning_rate": 0.0021021627188068017, + "loss": 2.7535, + "mean_token_accuracy": 0.43912047147750854, + "num_tokens": 9012262524.0, + "step": 17630 + }, + { + "epoch": 4.767712276906436, + "grad_norm": 35.75, + "learning_rate": 0.0021019259935329652, + "loss": 3.3152, + "mean_token_accuracy": 0.4166932702064514, + "num_tokens": 9012786625.0, + "step": 17631 + }, + { + "epoch": 4.767982693347756, + "grad_norm": 6.875, + "learning_rate": 0.002101689541278206, + "loss": 2.7029, + "mean_token_accuracy": 0.4476591944694519, + "num_tokens": 9013310897.0, + "step": 17632 + }, + { + "epoch": 4.768253109789075, + "grad_norm": 2.546875, + "learning_rate": 0.002101453362049781, + "loss": 2.777, + "mean_token_accuracy": 0.4442971348762512, + "num_tokens": 9013791085.0, + "step": 17633 + }, + { + "epoch": 4.768523526230394, + "grad_norm": 3.28125, + "learning_rate": 0.002101217455854934, + "loss": 2.6676, + "mean_token_accuracy": 0.4372202754020691, + "num_tokens": 9014315267.0, + "step": 17634 + }, + { + "epoch": 4.7687939426717145, + "grad_norm": 3.109375, + "learning_rate": 0.0021009818227009074, + "loss": 2.7911, + "mean_token_accuracy": 0.4299156963825226, + "num_tokens": 9014839319.0, + "step": 17635 + }, + { + "epoch": 4.769064359113034, + "grad_norm": 3.65625, + "learning_rate": 0.002100746462594927, + "loss": 2.7093, + "mean_token_accuracy": 0.449712336063385, + "num_tokens": 9015312784.0, + "step": 17636 + }, + { + "epoch": 4.769334775554354, + "grad_norm": 3.15625, + "learning_rate": 0.0021005113755442166, + "loss": 2.7661, + "mean_token_accuracy": 0.440095990896225, + "num_tokens": 9015837007.0, + "step": 17637 + }, + { + "epoch": 4.769605191995673, + "grad_norm": 3.25, + "learning_rate": 0.002100276561555989, + "loss": 2.9209, + "mean_token_accuracy": 0.41189056634902954, + "num_tokens": 9016337932.0, + "step": 17638 + }, + { + "epoch": 4.769875608436993, + "grad_norm": 4.09375, + "learning_rate": 0.0021000420206374495, + "loss": 2.9133, + "mean_token_accuracy": 0.43006905913352966, + "num_tokens": 9016835672.0, + "step": 17639 + }, + { + "epoch": 4.770146024878312, + "grad_norm": 4.375, + "learning_rate": 0.0020998077527957927, + "loss": 2.77, + "mean_token_accuracy": 0.4486953020095825, + "num_tokens": 9017348953.0, + "step": 17640 + }, + { + "epoch": 4.770416441319632, + "grad_norm": 4.65625, + "learning_rate": 0.0020995737580382086, + "loss": 2.8905, + "mean_token_accuracy": 0.42934897541999817, + "num_tokens": 9017819590.0, + "step": 17641 + }, + { + "epoch": 4.770686857760952, + "grad_norm": 4.625, + "learning_rate": 0.002099340036371876, + "loss": 2.7222, + "mean_token_accuracy": 0.4539632797241211, + "num_tokens": 9018343865.0, + "step": 17642 + }, + { + "epoch": 4.770957274202272, + "grad_norm": 3.859375, + "learning_rate": 0.002099106587803966, + "loss": 2.7494, + "mean_token_accuracy": 0.44520217180252075, + "num_tokens": 9018868068.0, + "step": 17643 + }, + { + "epoch": 4.771227690643591, + "grad_norm": 4.34375, + "learning_rate": 0.0020988734123416424, + "loss": 2.7223, + "mean_token_accuracy": 0.4428287744522095, + "num_tokens": 9019387431.0, + "step": 17644 + }, + { + "epoch": 4.771498107084911, + "grad_norm": 4.25, + "learning_rate": 0.0020986405099920574, + "loss": 2.683, + "mean_token_accuracy": 0.4440329968929291, + "num_tokens": 9019911571.0, + "step": 17645 + }, + { + "epoch": 4.77176852352623, + "grad_norm": 4.25, + "learning_rate": 0.00209840788076236, + "loss": 2.8982, + "mean_token_accuracy": 0.4264686107635498, + "num_tokens": 9020435842.0, + "step": 17646 + }, + { + "epoch": 4.77203893996755, + "grad_norm": 3.859375, + "learning_rate": 0.0020981755246596867, + "loss": 2.8138, + "mean_token_accuracy": 0.4317796230316162, + "num_tokens": 9020960109.0, + "step": 17647 + }, + { + "epoch": 4.772309356408869, + "grad_norm": 3.71875, + "learning_rate": 0.0020979434416911664, + "loss": 2.7858, + "mean_token_accuracy": 0.43625563383102417, + "num_tokens": 9021484247.0, + "step": 17648 + }, + { + "epoch": 4.7725797728501895, + "grad_norm": 3.703125, + "learning_rate": 0.002097711631863921, + "loss": 2.8846, + "mean_token_accuracy": 0.43352383375167847, + "num_tokens": 9022008413.0, + "step": 17649 + }, + { + "epoch": 4.772850189291509, + "grad_norm": 4.1875, + "learning_rate": 0.002097480095185062, + "loss": 2.6738, + "mean_token_accuracy": 0.44720330834388733, + "num_tokens": 9022532619.0, + "step": 17650 + }, + { + "epoch": 4.773120605732829, + "grad_norm": 13.0625, + "learning_rate": 0.0020972488316616947, + "loss": 2.4866, + "mean_token_accuracy": 0.4895296096801758, + "num_tokens": 9023052285.0, + "step": 17651 + }, + { + "epoch": 4.773391022174148, + "grad_norm": 11.375, + "learning_rate": 0.002097017841300915, + "loss": 2.618, + "mean_token_accuracy": 0.4439588189125061, + "num_tokens": 9023576342.0, + "step": 17652 + }, + { + "epoch": 4.773661438615468, + "grad_norm": 3.25, + "learning_rate": 0.002096787124109809, + "loss": 2.6981, + "mean_token_accuracy": 0.43620985746383667, + "num_tokens": 9024100535.0, + "step": 17653 + }, + { + "epoch": 4.773931855056787, + "grad_norm": 3.203125, + "learning_rate": 0.002096556680095457, + "loss": 2.7184, + "mean_token_accuracy": 0.4541740119457245, + "num_tokens": 9024624735.0, + "step": 17654 + }, + { + "epoch": 4.774202271498107, + "grad_norm": 3.96875, + "learning_rate": 0.0020963265092649296, + "loss": 2.6974, + "mean_token_accuracy": 0.4476066827774048, + "num_tokens": 9025148943.0, + "step": 17655 + }, + { + "epoch": 4.774472687939427, + "grad_norm": 3.75, + "learning_rate": 0.002096096611625288, + "loss": 2.7643, + "mean_token_accuracy": 0.4418204724788666, + "num_tokens": 9025673178.0, + "step": 17656 + }, + { + "epoch": 4.774743104380747, + "grad_norm": 4.375, + "learning_rate": 0.0020958669871835884, + "loss": 2.6202, + "mean_token_accuracy": 0.4411475658416748, + "num_tokens": 9026197297.0, + "step": 17657 + }, + { + "epoch": 4.775013520822066, + "grad_norm": 4.03125, + "learning_rate": 0.002095637635946875, + "loss": 2.5665, + "mean_token_accuracy": 0.43144690990448, + "num_tokens": 9026721503.0, + "step": 17658 + }, + { + "epoch": 4.775283937263386, + "grad_norm": 3.40625, + "learning_rate": 0.0020954085579221837, + "loss": 2.694, + "mean_token_accuracy": 0.45637887716293335, + "num_tokens": 9027222900.0, + "step": 17659 + }, + { + "epoch": 4.775554353704705, + "grad_norm": 3.796875, + "learning_rate": 0.0020951797531165457, + "loss": 2.6544, + "mean_token_accuracy": 0.4596065580844879, + "num_tokens": 9027722244.0, + "step": 17660 + }, + { + "epoch": 4.775824770146025, + "grad_norm": 3.984375, + "learning_rate": 0.002094951221536979, + "loss": 2.7149, + "mean_token_accuracy": 0.45447683334350586, + "num_tokens": 9028246495.0, + "step": 17661 + }, + { + "epoch": 4.776095186587344, + "grad_norm": 4.25, + "learning_rate": 0.0020947229631904972, + "loss": 2.6939, + "mean_token_accuracy": 0.4370292127132416, + "num_tokens": 9028770768.0, + "step": 17662 + }, + { + "epoch": 4.7763656030286645, + "grad_norm": 3.65625, + "learning_rate": 0.0020944949780841047, + "loss": 2.6692, + "mean_token_accuracy": 0.45141300559043884, + "num_tokens": 9029250168.0, + "step": 17663 + }, + { + "epoch": 4.776636019469984, + "grad_norm": 3.25, + "learning_rate": 0.002094267266224795, + "loss": 2.8762, + "mean_token_accuracy": 0.43194618821144104, + "num_tokens": 9029774404.0, + "step": 17664 + }, + { + "epoch": 4.776906435911304, + "grad_norm": 4.75, + "learning_rate": 0.0020940398276195547, + "loss": 3.0509, + "mean_token_accuracy": 0.42227721214294434, + "num_tokens": 9030298440.0, + "step": 17665 + }, + { + "epoch": 4.777176852352623, + "grad_norm": 4.5, + "learning_rate": 0.0020938126622753643, + "loss": 2.6767, + "mean_token_accuracy": 0.46233269572257996, + "num_tokens": 9030822508.0, + "step": 17666 + }, + { + "epoch": 4.777447268793942, + "grad_norm": 2.953125, + "learning_rate": 0.0020935857701991933, + "loss": 2.6886, + "mean_token_accuracy": 0.4776118993759155, + "num_tokens": 9031282918.0, + "step": 17667 + }, + { + "epoch": 4.777717685235262, + "grad_norm": 3.90625, + "learning_rate": 0.0020933591513980017, + "loss": 2.7587, + "mean_token_accuracy": 0.44941046833992004, + "num_tokens": 9031807180.0, + "step": 17668 + }, + { + "epoch": 4.777988101676582, + "grad_norm": 3.765625, + "learning_rate": 0.0020931328058787454, + "loss": 2.5746, + "mean_token_accuracy": 0.47238439321517944, + "num_tokens": 9032280110.0, + "step": 17669 + }, + { + "epoch": 4.778258518117902, + "grad_norm": 4.1875, + "learning_rate": 0.002092906733648367, + "loss": 2.6686, + "mean_token_accuracy": 0.46100056171417236, + "num_tokens": 9032804289.0, + "step": 17670 + }, + { + "epoch": 4.778528934559221, + "grad_norm": 28.875, + "learning_rate": 0.002092680934713804, + "loss": 2.8453, + "mean_token_accuracy": 0.45910483598709106, + "num_tokens": 9033328522.0, + "step": 17671 + }, + { + "epoch": 4.778799351000541, + "grad_norm": 7.6875, + "learning_rate": 0.002092455409081985, + "loss": 2.8538, + "mean_token_accuracy": 0.4424591362476349, + "num_tokens": 9033852796.0, + "step": 17672 + }, + { + "epoch": 4.779069767441861, + "grad_norm": 5.28125, + "learning_rate": 0.002092230156759829, + "loss": 2.3425, + "mean_token_accuracy": 0.5072733163833618, + "num_tokens": 9034377079.0, + "step": 17673 + }, + { + "epoch": 4.77934018388318, + "grad_norm": 3.734375, + "learning_rate": 0.002092005177754249, + "loss": 2.6842, + "mean_token_accuracy": 0.4242173731327057, + "num_tokens": 9034901243.0, + "step": 17674 + }, + { + "epoch": 4.779610600324499, + "grad_norm": 3.3125, + "learning_rate": 0.0020917804720721454, + "loss": 2.7589, + "mean_token_accuracy": 0.44112324714660645, + "num_tokens": 9035376019.0, + "step": 17675 + }, + { + "epoch": 4.7798810167658194, + "grad_norm": 3.578125, + "learning_rate": 0.002091556039720414, + "loss": 2.6923, + "mean_token_accuracy": 0.4686790108680725, + "num_tokens": 9035842788.0, + "step": 17676 + }, + { + "epoch": 4.780151433207139, + "grad_norm": 4.125, + "learning_rate": 0.0020913318807059427, + "loss": 2.8992, + "mean_token_accuracy": 0.42949753999710083, + "num_tokens": 9036367063.0, + "step": 17677 + }, + { + "epoch": 4.780421849648459, + "grad_norm": 4.28125, + "learning_rate": 0.0020911079950356078, + "loss": 2.8881, + "mean_token_accuracy": 0.43041330575942993, + "num_tokens": 9036891233.0, + "step": 17678 + }, + { + "epoch": 4.780692266089778, + "grad_norm": 4.21875, + "learning_rate": 0.002090884382716278, + "loss": 2.8362, + "mean_token_accuracy": 0.421128511428833, + "num_tokens": 9037415429.0, + "step": 17679 + }, + { + "epoch": 4.780962682531098, + "grad_norm": 4.6875, + "learning_rate": 0.0020906610437548163, + "loss": 2.8305, + "mean_token_accuracy": 0.445245623588562, + "num_tokens": 9037939647.0, + "step": 17680 + }, + { + "epoch": 4.781233098972417, + "grad_norm": 3.796875, + "learning_rate": 0.002090437978158074, + "loss": 2.8747, + "mean_token_accuracy": 0.4325968027114868, + "num_tokens": 9038407900.0, + "step": 17681 + }, + { + "epoch": 4.781503515413737, + "grad_norm": 3.328125, + "learning_rate": 0.0020902151859328957, + "loss": 2.7154, + "mean_token_accuracy": 0.44712620973587036, + "num_tokens": 9038932094.0, + "step": 17682 + }, + { + "epoch": 4.7817739318550565, + "grad_norm": 3.953125, + "learning_rate": 0.002089992667086118, + "loss": 2.82, + "mean_token_accuracy": 0.4340636730194092, + "num_tokens": 9039385663.0, + "step": 17683 + }, + { + "epoch": 4.782044348296377, + "grad_norm": 3.984375, + "learning_rate": 0.002089770421624568, + "loss": 2.7679, + "mean_token_accuracy": 0.4372550845146179, + "num_tokens": 9039855909.0, + "step": 17684 + }, + { + "epoch": 4.782314764737696, + "grad_norm": 4.15625, + "learning_rate": 0.002089548449555064, + "loss": 2.7347, + "mean_token_accuracy": 0.45029234886169434, + "num_tokens": 9040322389.0, + "step": 17685 + }, + { + "epoch": 4.782585181179016, + "grad_norm": 3.71875, + "learning_rate": 0.0020893267508844183, + "loss": 2.9563, + "mean_token_accuracy": 0.4390450417995453, + "num_tokens": 9040825945.0, + "step": 17686 + }, + { + "epoch": 4.782855597620335, + "grad_norm": 4.875, + "learning_rate": 0.0020891053256194315, + "loss": 2.7955, + "mean_token_accuracy": 0.43903064727783203, + "num_tokens": 9041313705.0, + "step": 17687 + }, + { + "epoch": 4.783126014061655, + "grad_norm": 3.515625, + "learning_rate": 0.0020888841737669002, + "loss": 2.5111, + "mean_token_accuracy": 0.48181837797164917, + "num_tokens": 9041778079.0, + "step": 17688 + }, + { + "epoch": 4.783396430502974, + "grad_norm": 3.921875, + "learning_rate": 0.0020886632953336067, + "loss": 2.7507, + "mean_token_accuracy": 0.44704771041870117, + "num_tokens": 9042302060.0, + "step": 17689 + }, + { + "epoch": 4.7836668469442944, + "grad_norm": 3.78125, + "learning_rate": 0.0020884426903263298, + "loss": 2.6661, + "mean_token_accuracy": 0.44519683718681335, + "num_tokens": 9042826198.0, + "step": 17690 + }, + { + "epoch": 4.783937263385614, + "grad_norm": 21.875, + "learning_rate": 0.0020882223587518384, + "loss": 2.6275, + "mean_token_accuracy": 0.440504252910614, + "num_tokens": 9043308228.0, + "step": 17691 + }, + { + "epoch": 4.784207679826934, + "grad_norm": 7.15625, + "learning_rate": 0.002088002300616894, + "loss": 2.8555, + "mean_token_accuracy": 0.43503597378730774, + "num_tokens": 9043832327.0, + "step": 17692 + }, + { + "epoch": 4.784478096268253, + "grad_norm": 2.5, + "learning_rate": 0.0020877825159282474, + "loss": 2.6538, + "mean_token_accuracy": 0.459268182516098, + "num_tokens": 9044356505.0, + "step": 17693 + }, + { + "epoch": 4.784748512709573, + "grad_norm": 3.515625, + "learning_rate": 0.002087563004692641, + "loss": 2.8344, + "mean_token_accuracy": 0.43462157249450684, + "num_tokens": 9044880677.0, + "step": 17694 + }, + { + "epoch": 4.785018929150892, + "grad_norm": 3.984375, + "learning_rate": 0.0020873437669168124, + "loss": 2.5483, + "mean_token_accuracy": 0.4765833616256714, + "num_tokens": 9045404859.0, + "step": 17695 + }, + { + "epoch": 4.785289345592212, + "grad_norm": 3.953125, + "learning_rate": 0.002087124802607487, + "loss": 2.9262, + "mean_token_accuracy": 0.42204225063323975, + "num_tokens": 9045911309.0, + "step": 17696 + }, + { + "epoch": 4.7855597620335315, + "grad_norm": 3.578125, + "learning_rate": 0.0020869061117713837, + "loss": 2.7677, + "mean_token_accuracy": 0.43747633695602417, + "num_tokens": 9046392307.0, + "step": 17697 + }, + { + "epoch": 4.785830178474852, + "grad_norm": 111.0, + "learning_rate": 0.0020866876944152135, + "loss": 4.269, + "mean_token_accuracy": 0.38804227113723755, + "num_tokens": 9046916562.0, + "step": 17698 + }, + { + "epoch": 4.786100594916171, + "grad_norm": 5.75, + "learning_rate": 0.0020864695505456764, + "loss": 2.7904, + "mean_token_accuracy": 0.4374646842479706, + "num_tokens": 9047440744.0, + "step": 17699 + }, + { + "epoch": 4.786371011357491, + "grad_norm": 3.0625, + "learning_rate": 0.0020862516801694662, + "loss": 2.8592, + "mean_token_accuracy": 0.4238305687904358, + "num_tokens": 9047961808.0, + "step": 17700 + }, + { + "epoch": 4.78664142779881, + "grad_norm": 3.71875, + "learning_rate": 0.0020860340832932686, + "loss": 2.6637, + "mean_token_accuracy": 0.44419392943382263, + "num_tokens": 9048486058.0, + "step": 17701 + }, + { + "epoch": 4.78691184424013, + "grad_norm": 3.640625, + "learning_rate": 0.0020858167599237597, + "loss": 2.7511, + "mean_token_accuracy": 0.4243357181549072, + "num_tokens": 9049010322.0, + "step": 17702 + }, + { + "epoch": 4.787182260681449, + "grad_norm": 3.890625, + "learning_rate": 0.002085599710067607, + "loss": 2.9597, + "mean_token_accuracy": 0.40847325325012207, + "num_tokens": 9049534421.0, + "step": 17703 + }, + { + "epoch": 4.7874526771227695, + "grad_norm": 4.25, + "learning_rate": 0.0020853829337314704, + "loss": 2.8955, + "mean_token_accuracy": 0.4382022023200989, + "num_tokens": 9050058697.0, + "step": 17704 + }, + { + "epoch": 4.787723093564089, + "grad_norm": 7.03125, + "learning_rate": 0.0020851664309220024, + "loss": 2.8034, + "mean_token_accuracy": 0.45393961668014526, + "num_tokens": 9050582929.0, + "step": 17705 + }, + { + "epoch": 4.787993510005409, + "grad_norm": 3.03125, + "learning_rate": 0.0020849502016458453, + "loss": 2.5883, + "mean_token_accuracy": 0.4773845672607422, + "num_tokens": 9051107185.0, + "step": 17706 + }, + { + "epoch": 4.788263926446728, + "grad_norm": 3.6875, + "learning_rate": 0.0020847342459096338, + "loss": 2.9036, + "mean_token_accuracy": 0.4374697208404541, + "num_tokens": 9051631416.0, + "step": 17707 + }, + { + "epoch": 4.788534342888047, + "grad_norm": 4.25, + "learning_rate": 0.002084518563719992, + "loss": 2.7219, + "mean_token_accuracy": 0.44763028621673584, + "num_tokens": 9052155610.0, + "step": 17708 + }, + { + "epoch": 4.788804759329367, + "grad_norm": 3.671875, + "learning_rate": 0.0020843031550835402, + "loss": 2.8473, + "mean_token_accuracy": 0.4383281469345093, + "num_tokens": 9052679803.0, + "step": 17709 + }, + { + "epoch": 4.789075175770687, + "grad_norm": 3.859375, + "learning_rate": 0.002084088020006888, + "loss": 2.6257, + "mean_token_accuracy": 0.4392731487751007, + "num_tokens": 9053203986.0, + "step": 17710 + }, + { + "epoch": 4.7893455922120065, + "grad_norm": 19.375, + "learning_rate": 0.002083873158496634, + "loss": 2.7395, + "mean_token_accuracy": 0.49352747201919556, + "num_tokens": 9053688295.0, + "step": 17711 + }, + { + "epoch": 4.789616008653326, + "grad_norm": 120.5, + "learning_rate": 0.0020836585705593727, + "loss": 3.3396, + "mean_token_accuracy": 0.3763386607170105, + "num_tokens": 9054212528.0, + "step": 17712 + }, + { + "epoch": 4.789886425094646, + "grad_norm": 9.25, + "learning_rate": 0.002083444256201688, + "loss": 2.7561, + "mean_token_accuracy": 0.4654605984687805, + "num_tokens": 9054671902.0, + "step": 17713 + }, + { + "epoch": 4.790156841535966, + "grad_norm": 3.484375, + "learning_rate": 0.0020832302154301548, + "loss": 2.7992, + "mean_token_accuracy": 0.4390750825405121, + "num_tokens": 9055172678.0, + "step": 17714 + }, + { + "epoch": 4.790427257977285, + "grad_norm": 4.4375, + "learning_rate": 0.0020830164482513415, + "loss": 2.8104, + "mean_token_accuracy": 0.43776965141296387, + "num_tokens": 9055658420.0, + "step": 17715 + }, + { + "epoch": 4.790697674418604, + "grad_norm": 3.875, + "learning_rate": 0.0020828029546718064, + "loss": 2.717, + "mean_token_accuracy": 0.43979763984680176, + "num_tokens": 9056182644.0, + "step": 17716 + }, + { + "epoch": 4.790968090859924, + "grad_norm": 3.84375, + "learning_rate": 0.0020825897346981002, + "loss": 2.9284, + "mean_token_accuracy": 0.44851893186569214, + "num_tokens": 9056650095.0, + "step": 17717 + }, + { + "epoch": 4.791238507301244, + "grad_norm": 3.953125, + "learning_rate": 0.002082376788336766, + "loss": 2.8663, + "mean_token_accuracy": 0.42528313398361206, + "num_tokens": 9057174310.0, + "step": 17718 + }, + { + "epoch": 4.791508923742564, + "grad_norm": 3.25, + "learning_rate": 0.0020821641155943362, + "loss": 2.5432, + "mean_token_accuracy": 0.4686456024646759, + "num_tokens": 9057651539.0, + "step": 17719 + }, + { + "epoch": 4.791779340183883, + "grad_norm": 3.296875, + "learning_rate": 0.0020819517164773365, + "loss": 2.7115, + "mean_token_accuracy": 0.44679969549179077, + "num_tokens": 9058167771.0, + "step": 17720 + }, + { + "epoch": 4.792049756625203, + "grad_norm": 3.5625, + "learning_rate": 0.0020817395909922863, + "loss": 2.7357, + "mean_token_accuracy": 0.44252070784568787, + "num_tokens": 9058691942.0, + "step": 17721 + }, + { + "epoch": 4.792320173066522, + "grad_norm": 3.796875, + "learning_rate": 0.002081527739145691, + "loss": 2.8997, + "mean_token_accuracy": 0.4249228239059448, + "num_tokens": 9059216167.0, + "step": 17722 + }, + { + "epoch": 4.792590589507842, + "grad_norm": 3.953125, + "learning_rate": 0.002081316160944052, + "loss": 2.8695, + "mean_token_accuracy": 0.4428752064704895, + "num_tokens": 9059688528.0, + "step": 17723 + }, + { + "epoch": 4.792861005949161, + "grad_norm": 3.625, + "learning_rate": 0.002081104856393863, + "loss": 2.8746, + "mean_token_accuracy": 0.4253644347190857, + "num_tokens": 9060212781.0, + "step": 17724 + }, + { + "epoch": 4.7931314223904815, + "grad_norm": 4.9375, + "learning_rate": 0.002080893825501604, + "loss": 2.7563, + "mean_token_accuracy": 0.43718674778938293, + "num_tokens": 9060736922.0, + "step": 17725 + }, + { + "epoch": 4.793401838831801, + "grad_norm": 3.5625, + "learning_rate": 0.002080683068273752, + "loss": 2.8238, + "mean_token_accuracy": 0.44370904564857483, + "num_tokens": 9061238502.0, + "step": 17726 + }, + { + "epoch": 4.793672255273121, + "grad_norm": 4.46875, + "learning_rate": 0.0020804725847167744, + "loss": 2.7469, + "mean_token_accuracy": 0.4621717929840088, + "num_tokens": 9061762656.0, + "step": 17727 + }, + { + "epoch": 4.79394267171444, + "grad_norm": 5.34375, + "learning_rate": 0.002080262374837128, + "loss": 2.8182, + "mean_token_accuracy": 0.43349677324295044, + "num_tokens": 9062231102.0, + "step": 17728 + }, + { + "epoch": 4.79421308815576, + "grad_norm": 4.0, + "learning_rate": 0.0020800524386412637, + "loss": 2.8088, + "mean_token_accuracy": 0.4446682929992676, + "num_tokens": 9062755271.0, + "step": 17729 + }, + { + "epoch": 4.794483504597079, + "grad_norm": 3.953125, + "learning_rate": 0.002079842776135622, + "loss": 2.841, + "mean_token_accuracy": 0.44128289818763733, + "num_tokens": 9063279466.0, + "step": 17730 + }, + { + "epoch": 4.794753921038399, + "grad_norm": 18.0, + "learning_rate": 0.0020796333873266365, + "loss": 2.5786, + "mean_token_accuracy": 0.467558354139328, + "num_tokens": 9063773647.0, + "step": 17731 + }, + { + "epoch": 4.795024337479719, + "grad_norm": 8.5625, + "learning_rate": 0.002079424272220731, + "loss": 2.9889, + "mean_token_accuracy": 0.4315623641014099, + "num_tokens": 9064240631.0, + "step": 17732 + }, + { + "epoch": 4.795294753921039, + "grad_norm": 3.359375, + "learning_rate": 0.0020792154308243225, + "loss": 2.7053, + "mean_token_accuracy": 0.4432569146156311, + "num_tokens": 9064764804.0, + "step": 17733 + }, + { + "epoch": 4.795565170362358, + "grad_norm": 4.9375, + "learning_rate": 0.0020790068631438194, + "loss": 2.593, + "mean_token_accuracy": 0.46094459295272827, + "num_tokens": 9065285658.0, + "step": 17734 + }, + { + "epoch": 4.795835586803678, + "grad_norm": 4.34375, + "learning_rate": 0.0020787985691856216, + "loss": 2.7537, + "mean_token_accuracy": 0.46302270889282227, + "num_tokens": 9065791490.0, + "step": 17735 + }, + { + "epoch": 4.796106003244997, + "grad_norm": 3.6875, + "learning_rate": 0.002078590548956118, + "loss": 2.7574, + "mean_token_accuracy": 0.45194628834724426, + "num_tokens": 9066315576.0, + "step": 17736 + }, + { + "epoch": 4.796376419686317, + "grad_norm": 4.34375, + "learning_rate": 0.0020783828024616927, + "loss": 2.7562, + "mean_token_accuracy": 0.450066477060318, + "num_tokens": 9066778619.0, + "step": 17737 + }, + { + "epoch": 4.796646836127636, + "grad_norm": 4.375, + "learning_rate": 0.002078175329708721, + "loss": 2.6045, + "mean_token_accuracy": 0.42749524116516113, + "num_tokens": 9067302861.0, + "step": 17738 + }, + { + "epoch": 4.7969172525689565, + "grad_norm": 3.34375, + "learning_rate": 0.002077968130703567, + "loss": 2.8815, + "mean_token_accuracy": 0.42694228887557983, + "num_tokens": 9067827039.0, + "step": 17739 + }, + { + "epoch": 4.797187669010276, + "grad_norm": 3.328125, + "learning_rate": 0.0020777612054525877, + "loss": 2.7193, + "mean_token_accuracy": 0.43553274869918823, + "num_tokens": 9068351254.0, + "step": 17740 + }, + { + "epoch": 4.797458085451596, + "grad_norm": 3.625, + "learning_rate": 0.002077554553962134, + "loss": 2.8312, + "mean_token_accuracy": 0.4386600852012634, + "num_tokens": 9068857904.0, + "step": 17741 + }, + { + "epoch": 4.797728501892915, + "grad_norm": 3.625, + "learning_rate": 0.0020773481762385456, + "loss": 2.5173, + "mean_token_accuracy": 0.4660191535949707, + "num_tokens": 9069382124.0, + "step": 17742 + }, + { + "epoch": 4.797998918334235, + "grad_norm": 3.484375, + "learning_rate": 0.002077142072288155, + "loss": 2.5844, + "mean_token_accuracy": 0.4683123826980591, + "num_tokens": 9069906167.0, + "step": 17743 + }, + { + "epoch": 4.798269334775554, + "grad_norm": 3.84375, + "learning_rate": 0.0020769362421172864, + "loss": 2.9187, + "mean_token_accuracy": 0.43743011355400085, + "num_tokens": 9070430453.0, + "step": 17744 + }, + { + "epoch": 4.798539751216874, + "grad_norm": 4.0, + "learning_rate": 0.002076730685732254, + "loss": 2.8636, + "mean_token_accuracy": 0.4276803135871887, + "num_tokens": 9070954720.0, + "step": 17745 + }, + { + "epoch": 4.798810167658194, + "grad_norm": 4.3125, + "learning_rate": 0.0020765254031393657, + "loss": 2.8287, + "mean_token_accuracy": 0.4592061936855316, + "num_tokens": 9071421150.0, + "step": 17746 + }, + { + "epoch": 4.799080584099514, + "grad_norm": 4.875, + "learning_rate": 0.0020763203943449213, + "loss": 2.604, + "mean_token_accuracy": 0.4674825966358185, + "num_tokens": 9071945397.0, + "step": 17747 + }, + { + "epoch": 4.799351000540833, + "grad_norm": 3.3125, + "learning_rate": 0.0020761156593552097, + "loss": 2.7005, + "mean_token_accuracy": 0.46457210183143616, + "num_tokens": 9072408473.0, + "step": 17748 + }, + { + "epoch": 4.799621416982152, + "grad_norm": 3.96875, + "learning_rate": 0.0020759111981765127, + "loss": 2.7987, + "mean_token_accuracy": 0.44827139377593994, + "num_tokens": 9072907678.0, + "step": 17749 + }, + { + "epoch": 4.799891833423472, + "grad_norm": 4.21875, + "learning_rate": 0.002075707010815105, + "loss": 2.7988, + "mean_token_accuracy": 0.4516493082046509, + "num_tokens": 9073429609.0, + "step": 17750 + }, + { + "epoch": 4.800162249864792, + "grad_norm": 8.5625, + "learning_rate": 0.00207550309727725, + "loss": 2.1083, + "mean_token_accuracy": 0.516231894493103, + "num_tokens": 9073889167.0, + "step": 17751 + }, + { + "epoch": 4.800432666306111, + "grad_norm": 6.34375, + "learning_rate": 0.0020752994575692057, + "loss": 2.8054, + "mean_token_accuracy": 0.4333287179470062, + "num_tokens": 9074413389.0, + "step": 17752 + }, + { + "epoch": 4.800703082747431, + "grad_norm": 3.109375, + "learning_rate": 0.00207509609169722, + "loss": 2.7103, + "mean_token_accuracy": 0.44757676124572754, + "num_tokens": 9074937589.0, + "step": 17753 + }, + { + "epoch": 4.800973499188751, + "grad_norm": 3.671875, + "learning_rate": 0.0020748929996675324, + "loss": 2.7517, + "mean_token_accuracy": 0.4431579113006592, + "num_tokens": 9075461751.0, + "step": 17754 + }, + { + "epoch": 4.801243915630071, + "grad_norm": 3.90625, + "learning_rate": 0.0020746901814863747, + "loss": 2.7289, + "mean_token_accuracy": 0.4467005133628845, + "num_tokens": 9075935047.0, + "step": 17755 + }, + { + "epoch": 4.80151433207139, + "grad_norm": 2.96875, + "learning_rate": 0.00207448763715997, + "loss": 2.5891, + "mean_token_accuracy": 0.4631626605987549, + "num_tokens": 9076459291.0, + "step": 17756 + }, + { + "epoch": 4.801784748512709, + "grad_norm": 4.375, + "learning_rate": 0.002074285366694532, + "loss": 2.944, + "mean_token_accuracy": 0.4233163297176361, + "num_tokens": 9076983346.0, + "step": 17757 + }, + { + "epoch": 4.802055164954029, + "grad_norm": 7.9375, + "learning_rate": 0.0020740833700962687, + "loss": 2.5381, + "mean_token_accuracy": 0.4537585377693176, + "num_tokens": 9077507567.0, + "step": 17758 + }, + { + "epoch": 4.8023255813953485, + "grad_norm": 4.40625, + "learning_rate": 0.0020738816473713758, + "loss": 2.9149, + "mean_token_accuracy": 0.44167065620422363, + "num_tokens": 9077973094.0, + "step": 17759 + }, + { + "epoch": 4.802595997836669, + "grad_norm": 3.40625, + "learning_rate": 0.002073680198526046, + "loss": 2.6276, + "mean_token_accuracy": 0.4373057186603546, + "num_tokens": 9078497257.0, + "step": 17760 + }, + { + "epoch": 4.802866414277988, + "grad_norm": 3.421875, + "learning_rate": 0.0020734790235664557, + "loss": 2.7369, + "mean_token_accuracy": 0.45180580019950867, + "num_tokens": 9078979181.0, + "step": 17761 + }, + { + "epoch": 4.803136830719308, + "grad_norm": 3.984375, + "learning_rate": 0.002073278122498782, + "loss": 2.8637, + "mean_token_accuracy": 0.44979193806648254, + "num_tokens": 9079503437.0, + "step": 17762 + }, + { + "epoch": 4.803407247160627, + "grad_norm": 4.1875, + "learning_rate": 0.0020730774953291865, + "loss": 2.796, + "mean_token_accuracy": 0.4455464482307434, + "num_tokens": 9080027642.0, + "step": 17763 + }, + { + "epoch": 4.803677663601947, + "grad_norm": 3.203125, + "learning_rate": 0.0020728771420638263, + "loss": 2.88, + "mean_token_accuracy": 0.4481887221336365, + "num_tokens": 9080546411.0, + "step": 17764 + }, + { + "epoch": 4.803948080043266, + "grad_norm": 5.03125, + "learning_rate": 0.0020726770627088483, + "loss": 2.905, + "mean_token_accuracy": 0.4220409393310547, + "num_tokens": 9081070670.0, + "step": 17765 + }, + { + "epoch": 4.804218496484586, + "grad_norm": 3.390625, + "learning_rate": 0.002072477257270392, + "loss": 2.7408, + "mean_token_accuracy": 0.4457452893257141, + "num_tokens": 9081594930.0, + "step": 17766 + }, + { + "epoch": 4.804488912925906, + "grad_norm": 4.375, + "learning_rate": 0.002072277725754587, + "loss": 2.6991, + "mean_token_accuracy": 0.4423103928565979, + "num_tokens": 9082119071.0, + "step": 17767 + }, + { + "epoch": 4.804759329367226, + "grad_norm": 3.453125, + "learning_rate": 0.0020720784681675557, + "loss": 2.5675, + "mean_token_accuracy": 0.47440943121910095, + "num_tokens": 9082643268.0, + "step": 17768 + }, + { + "epoch": 4.805029745808545, + "grad_norm": 4.0625, + "learning_rate": 0.0020718794845154135, + "loss": 2.7229, + "mean_token_accuracy": 0.45032089948654175, + "num_tokens": 9083167391.0, + "step": 17769 + }, + { + "epoch": 4.805300162249865, + "grad_norm": 68.5, + "learning_rate": 0.002071680774804264, + "loss": 2.7627, + "mean_token_accuracy": 0.4881872534751892, + "num_tokens": 9083674726.0, + "step": 17770 + }, + { + "epoch": 4.805570578691184, + "grad_norm": 9.9375, + "learning_rate": 0.0020714823390402043, + "loss": 2.2259, + "mean_token_accuracy": 0.50160151720047, + "num_tokens": 9084198901.0, + "step": 17771 + }, + { + "epoch": 4.805840995132504, + "grad_norm": 8.75, + "learning_rate": 0.0020712841772293237, + "loss": 2.9535, + "mean_token_accuracy": 0.42219865322113037, + "num_tokens": 9084723095.0, + "step": 17772 + }, + { + "epoch": 4.8061114115738235, + "grad_norm": 4.625, + "learning_rate": 0.0020710862893777017, + "loss": 2.6275, + "mean_token_accuracy": 0.45509275794029236, + "num_tokens": 9085247259.0, + "step": 17773 + }, + { + "epoch": 4.806381828015144, + "grad_norm": 3.953125, + "learning_rate": 0.002070888675491411, + "loss": 2.6706, + "mean_token_accuracy": 0.4606686234474182, + "num_tokens": 9085753865.0, + "step": 17774 + }, + { + "epoch": 4.806652244456463, + "grad_norm": 3.296875, + "learning_rate": 0.002070691335576515, + "loss": 2.8418, + "mean_token_accuracy": 0.4468570053577423, + "num_tokens": 9086274559.0, + "step": 17775 + }, + { + "epoch": 4.806922660897783, + "grad_norm": 3.609375, + "learning_rate": 0.0020704942696390684, + "loss": 2.7329, + "mean_token_accuracy": 0.444671630859375, + "num_tokens": 9086798585.0, + "step": 17776 + }, + { + "epoch": 4.807193077339102, + "grad_norm": 4.0625, + "learning_rate": 0.002070297477685117, + "loss": 2.6921, + "mean_token_accuracy": 0.4273679852485657, + "num_tokens": 9087275824.0, + "step": 17777 + }, + { + "epoch": 4.807463493780422, + "grad_norm": 2.921875, + "learning_rate": 0.0020701009597206997, + "loss": 2.8848, + "mean_token_accuracy": 0.4403170645236969, + "num_tokens": 9087800038.0, + "step": 17778 + }, + { + "epoch": 4.807733910221741, + "grad_norm": 4.25, + "learning_rate": 0.0020699047157518463, + "loss": 2.8065, + "mean_token_accuracy": 0.43721240758895874, + "num_tokens": 9088299176.0, + "step": 17779 + }, + { + "epoch": 4.808004326663061, + "grad_norm": 3.171875, + "learning_rate": 0.002069708745784577, + "loss": 2.6499, + "mean_token_accuracy": 0.4626309275627136, + "num_tokens": 9088823255.0, + "step": 17780 + }, + { + "epoch": 4.808274743104381, + "grad_norm": 3.765625, + "learning_rate": 0.0020695130498249066, + "loss": 2.8931, + "mean_token_accuracy": 0.4368160367012024, + "num_tokens": 9089347430.0, + "step": 17781 + }, + { + "epoch": 4.808545159545701, + "grad_norm": 3.484375, + "learning_rate": 0.002069317627878838, + "loss": 2.7395, + "mean_token_accuracy": 0.44945859909057617, + "num_tokens": 9089871713.0, + "step": 17782 + }, + { + "epoch": 4.80881557598702, + "grad_norm": 3.625, + "learning_rate": 0.0020691224799523687, + "loss": 2.7363, + "mean_token_accuracy": 0.43584465980529785, + "num_tokens": 9090395994.0, + "step": 17783 + }, + { + "epoch": 4.80908599242834, + "grad_norm": 3.15625, + "learning_rate": 0.0020689276060514854, + "loss": 2.6707, + "mean_token_accuracy": 0.4563414454460144, + "num_tokens": 9090920150.0, + "step": 17784 + }, + { + "epoch": 4.809356408869659, + "grad_norm": 3.96875, + "learning_rate": 0.002068733006182168, + "loss": 2.8546, + "mean_token_accuracy": 0.45256054401397705, + "num_tokens": 9091444405.0, + "step": 17785 + }, + { + "epoch": 4.809626825310979, + "grad_norm": 6.0625, + "learning_rate": 0.002068538680350387, + "loss": 2.6709, + "mean_token_accuracy": 0.4755374789237976, + "num_tokens": 9091968611.0, + "step": 17786 + }, + { + "epoch": 4.8098972417522985, + "grad_norm": 2.78125, + "learning_rate": 0.002068344628562105, + "loss": 2.6492, + "mean_token_accuracy": 0.45382946729660034, + "num_tokens": 9092492809.0, + "step": 17787 + }, + { + "epoch": 4.810167658193619, + "grad_norm": 3.484375, + "learning_rate": 0.002068150850823275, + "loss": 2.9212, + "mean_token_accuracy": 0.4402812123298645, + "num_tokens": 9093008073.0, + "step": 17788 + }, + { + "epoch": 4.810438074634938, + "grad_norm": 4.25, + "learning_rate": 0.0020679573471398443, + "loss": 2.7122, + "mean_token_accuracy": 0.45012426376342773, + "num_tokens": 9093532289.0, + "step": 17789 + }, + { + "epoch": 4.810708491076257, + "grad_norm": 3.828125, + "learning_rate": 0.0020677641175177505, + "loss": 2.7981, + "mean_token_accuracy": 0.4519355893135071, + "num_tokens": 9094028439.0, + "step": 17790 + }, + { + "epoch": 4.810978907517577, + "grad_norm": 21.0, + "learning_rate": 0.002067571161962921, + "loss": 2.4313, + "mean_token_accuracy": 0.5008180141448975, + "num_tokens": 9094552716.0, + "step": 17791 + }, + { + "epoch": 4.811249323958897, + "grad_norm": 50.0, + "learning_rate": 0.002067378480481277, + "loss": 2.6663, + "mean_token_accuracy": 0.468285471200943, + "num_tokens": 9095076941.0, + "step": 17792 + }, + { + "epoch": 4.811519740400216, + "grad_norm": 11.875, + "learning_rate": 0.0020671860730787305, + "loss": 2.896, + "mean_token_accuracy": 0.4160527288913727, + "num_tokens": 9095601177.0, + "step": 17793 + }, + { + "epoch": 4.8117901568415355, + "grad_norm": 3.59375, + "learning_rate": 0.002066993939761184, + "loss": 2.6201, + "mean_token_accuracy": 0.4556787610054016, + "num_tokens": 9096125397.0, + "step": 17794 + }, + { + "epoch": 4.812060573282856, + "grad_norm": 4.03125, + "learning_rate": 0.0020668020805345347, + "loss": 2.8564, + "mean_token_accuracy": 0.4560888111591339, + "num_tokens": 9096606576.0, + "step": 17795 + }, + { + "epoch": 4.812330989724176, + "grad_norm": 3.21875, + "learning_rate": 0.0020666104954046682, + "loss": 2.7752, + "mean_token_accuracy": 0.45043396949768066, + "num_tokens": 9097130745.0, + "step": 17796 + }, + { + "epoch": 4.812601406165495, + "grad_norm": 4.5625, + "learning_rate": 0.0020664191843774633, + "loss": 2.6615, + "mean_token_accuracy": 0.4571867883205414, + "num_tokens": 9097654931.0, + "step": 17797 + }, + { + "epoch": 4.812871822606814, + "grad_norm": 3.65625, + "learning_rate": 0.002066228147458789, + "loss": 2.7918, + "mean_token_accuracy": 0.41335877776145935, + "num_tokens": 9098179153.0, + "step": 17798 + }, + { + "epoch": 4.813142239048134, + "grad_norm": 4.15625, + "learning_rate": 0.002066037384654508, + "loss": 2.8677, + "mean_token_accuracy": 0.42434024810791016, + "num_tokens": 9098703405.0, + "step": 17799 + }, + { + "epoch": 4.813412655489453, + "grad_norm": 3.84375, + "learning_rate": 0.0020658468959704737, + "loss": 2.8645, + "mean_token_accuracy": 0.4395831823348999, + "num_tokens": 9099198164.0, + "step": 17800 + }, + { + "epoch": 4.8136830719307735, + "grad_norm": 3.75, + "learning_rate": 0.00206565668141253, + "loss": 2.5468, + "mean_token_accuracy": 0.44642746448516846, + "num_tokens": 9099722377.0, + "step": 17801 + }, + { + "epoch": 4.813953488372093, + "grad_norm": 3.1875, + "learning_rate": 0.0020654667409865123, + "loss": 2.7492, + "mean_token_accuracy": 0.43272852897644043, + "num_tokens": 9100246524.0, + "step": 17802 + }, + { + "epoch": 4.814223904813413, + "grad_norm": 4.1875, + "learning_rate": 0.0020652770746982514, + "loss": 2.6879, + "mean_token_accuracy": 0.4319877028465271, + "num_tokens": 9100770715.0, + "step": 17803 + }, + { + "epoch": 4.814494321254732, + "grad_norm": 2.875, + "learning_rate": 0.002065087682553564, + "loss": 2.758, + "mean_token_accuracy": 0.4560913145542145, + "num_tokens": 9101294968.0, + "step": 17804 + }, + { + "epoch": 4.814764737696052, + "grad_norm": 3.828125, + "learning_rate": 0.0020648985645582633, + "loss": 2.7907, + "mean_token_accuracy": 0.4644224941730499, + "num_tokens": 9101769250.0, + "step": 17805 + }, + { + "epoch": 4.815035154137371, + "grad_norm": 4.0625, + "learning_rate": 0.0020647097207181497, + "loss": 2.6639, + "mean_token_accuracy": 0.43776559829711914, + "num_tokens": 9102293496.0, + "step": 17806 + }, + { + "epoch": 4.815305570578691, + "grad_norm": 4.15625, + "learning_rate": 0.0020645211510390204, + "loss": 2.7704, + "mean_token_accuracy": 0.44419312477111816, + "num_tokens": 9102817686.0, + "step": 17807 + }, + { + "epoch": 4.8155759870200106, + "grad_norm": 3.671875, + "learning_rate": 0.002064332855526659, + "loss": 2.6279, + "mean_token_accuracy": 0.4487782120704651, + "num_tokens": 9103341944.0, + "step": 17808 + }, + { + "epoch": 4.815846403461331, + "grad_norm": 3.125, + "learning_rate": 0.0020641448341868437, + "loss": 2.8131, + "mean_token_accuracy": 0.4481801688671112, + "num_tokens": 9103866129.0, + "step": 17809 + }, + { + "epoch": 4.81611681990265, + "grad_norm": 3.640625, + "learning_rate": 0.002063957087025343, + "loss": 2.778, + "mean_token_accuracy": 0.4630396366119385, + "num_tokens": 9104390334.0, + "step": 17810 + }, + { + "epoch": 4.81638723634397, + "grad_norm": 6.59375, + "learning_rate": 0.002063769614047919, + "loss": 1.8971, + "mean_token_accuracy": 0.5461211204528809, + "num_tokens": 9104914519.0, + "step": 17811 + }, + { + "epoch": 4.816657652785289, + "grad_norm": 6.71875, + "learning_rate": 0.0020635824152603227, + "loss": 2.6985, + "mean_token_accuracy": 0.4379447102546692, + "num_tokens": 9105438760.0, + "step": 17812 + }, + { + "epoch": 4.816928069226609, + "grad_norm": 3.09375, + "learning_rate": 0.0020633954906682976, + "loss": 2.6596, + "mean_token_accuracy": 0.44631195068359375, + "num_tokens": 9105962967.0, + "step": 17813 + }, + { + "epoch": 4.817198485667928, + "grad_norm": 32.25, + "learning_rate": 0.0020632088402775806, + "loss": 2.7141, + "mean_token_accuracy": 0.4436357319355011, + "num_tokens": 9106439380.0, + "step": 17814 + }, + { + "epoch": 4.8174689021092485, + "grad_norm": 5.40625, + "learning_rate": 0.0020630224640938973, + "loss": 2.7254, + "mean_token_accuracy": 0.4585132300853729, + "num_tokens": 9106963589.0, + "step": 17815 + }, + { + "epoch": 4.817739318550568, + "grad_norm": 3.140625, + "learning_rate": 0.0020628363621229663, + "loss": 2.9198, + "mean_token_accuracy": 0.41734692454338074, + "num_tokens": 9107487849.0, + "step": 17816 + }, + { + "epoch": 4.818009734991888, + "grad_norm": 3.484375, + "learning_rate": 0.002062650534370499, + "loss": 2.7215, + "mean_token_accuracy": 0.44666868448257446, + "num_tokens": 9107982131.0, + "step": 17817 + }, + { + "epoch": 4.818280151433207, + "grad_norm": 4.03125, + "learning_rate": 0.002062464980842195, + "loss": 2.452, + "mean_token_accuracy": 0.4507668614387512, + "num_tokens": 9108506234.0, + "step": 17818 + }, + { + "epoch": 4.818550567874527, + "grad_norm": 3.828125, + "learning_rate": 0.002062279701543751, + "loss": 2.6814, + "mean_token_accuracy": 0.4511945843696594, + "num_tokens": 9109030505.0, + "step": 17819 + }, + { + "epoch": 4.818820984315846, + "grad_norm": 3.90625, + "learning_rate": 0.0020620946964808482, + "loss": 2.6282, + "mean_token_accuracy": 0.45859238505363464, + "num_tokens": 9109554686.0, + "step": 17820 + }, + { + "epoch": 4.819091400757166, + "grad_norm": 4.4375, + "learning_rate": 0.0020619099656591657, + "loss": 2.6844, + "mean_token_accuracy": 0.45231297612190247, + "num_tokens": 9110068034.0, + "step": 17821 + }, + { + "epoch": 4.8193618171984856, + "grad_norm": 4.15625, + "learning_rate": 0.0020617255090843705, + "loss": 2.9309, + "mean_token_accuracy": 0.4294910430908203, + "num_tokens": 9110574465.0, + "step": 17822 + }, + { + "epoch": 4.819632233639806, + "grad_norm": 3.96875, + "learning_rate": 0.0020615413267621218, + "loss": 2.7778, + "mean_token_accuracy": 0.43060213327407837, + "num_tokens": 9111098653.0, + "step": 17823 + }, + { + "epoch": 4.819902650081125, + "grad_norm": 3.796875, + "learning_rate": 0.002061357418698072, + "loss": 2.8348, + "mean_token_accuracy": 0.4419218897819519, + "num_tokens": 9111622912.0, + "step": 17824 + }, + { + "epoch": 4.820173066522445, + "grad_norm": 4.65625, + "learning_rate": 0.002061173784897864, + "loss": 2.8204, + "mean_token_accuracy": 0.41435864567756653, + "num_tokens": 9112147106.0, + "step": 17825 + }, + { + "epoch": 4.820443482963764, + "grad_norm": 3.03125, + "learning_rate": 0.0020609904253671304, + "loss": 2.8218, + "mean_token_accuracy": 0.4571917653083801, + "num_tokens": 9112589351.0, + "step": 17826 + }, + { + "epoch": 4.820713899405084, + "grad_norm": 4.15625, + "learning_rate": 0.0020608073401115006, + "loss": 2.6529, + "mean_token_accuracy": 0.43435725569725037, + "num_tokens": 9113113622.0, + "step": 17827 + }, + { + "epoch": 4.820984315846403, + "grad_norm": 3.65625, + "learning_rate": 0.0020606245291365883, + "loss": 2.7683, + "mean_token_accuracy": 0.439008891582489, + "num_tokens": 9113637780.0, + "step": 17828 + }, + { + "epoch": 4.8212547322877235, + "grad_norm": 4.46875, + "learning_rate": 0.002060441992448004, + "loss": 2.6316, + "mean_token_accuracy": 0.42173123359680176, + "num_tokens": 9114162041.0, + "step": 17829 + }, + { + "epoch": 4.821525148729043, + "grad_norm": 2.9375, + "learning_rate": 0.0020602597300513497, + "loss": 2.7699, + "mean_token_accuracy": 0.4329468607902527, + "num_tokens": 9114686289.0, + "step": 17830 + }, + { + "epoch": 4.821795565170362, + "grad_norm": 13.4375, + "learning_rate": 0.0020600777419522175, + "loss": 2.7126, + "mean_token_accuracy": 0.481001079082489, + "num_tokens": 9115189852.0, + "step": 17831 + }, + { + "epoch": 4.822065981611682, + "grad_norm": 7.1875, + "learning_rate": 0.002059896028156189, + "loss": 2.7691, + "mean_token_accuracy": 0.43145352602005005, + "num_tokens": 9115714068.0, + "step": 17832 + }, + { + "epoch": 4.822336398053002, + "grad_norm": 2.59375, + "learning_rate": 0.0020597145886688435, + "loss": 2.8042, + "mean_token_accuracy": 0.45915013551712036, + "num_tokens": 9116193165.0, + "step": 17833 + }, + { + "epoch": 4.822606814494321, + "grad_norm": 2.703125, + "learning_rate": 0.0020595334234957442, + "loss": 2.7643, + "mean_token_accuracy": 0.4552009105682373, + "num_tokens": 9116717436.0, + "step": 17834 + }, + { + "epoch": 4.8228772309356405, + "grad_norm": 3.40625, + "learning_rate": 0.002059352532642454, + "loss": 2.7806, + "mean_token_accuracy": 0.4414965510368347, + "num_tokens": 9117241572.0, + "step": 17835 + }, + { + "epoch": 4.823147647376961, + "grad_norm": 3.90625, + "learning_rate": 0.002059171916114519, + "loss": 2.8496, + "mean_token_accuracy": 0.4446069896221161, + "num_tokens": 9117726868.0, + "step": 17836 + }, + { + "epoch": 4.823418063818281, + "grad_norm": 4.0625, + "learning_rate": 0.0020589915739174834, + "loss": 2.9616, + "mean_token_accuracy": 0.4210470914840698, + "num_tokens": 9118251076.0, + "step": 17837 + }, + { + "epoch": 4.8236884802596, + "grad_norm": 3.40625, + "learning_rate": 0.00205881150605688, + "loss": 2.8656, + "mean_token_accuracy": 0.4474138617515564, + "num_tokens": 9118753698.0, + "step": 17838 + }, + { + "epoch": 4.823958896700919, + "grad_norm": 3.75, + "learning_rate": 0.0020586317125382346, + "loss": 2.701, + "mean_token_accuracy": 0.47073930501937866, + "num_tokens": 9119277870.0, + "step": 17839 + }, + { + "epoch": 4.824229313142239, + "grad_norm": 3.875, + "learning_rate": 0.0020584521933670625, + "loss": 2.8988, + "mean_token_accuracy": 0.43947237730026245, + "num_tokens": 9119802149.0, + "step": 17840 + }, + { + "epoch": 4.824499729583558, + "grad_norm": 4.65625, + "learning_rate": 0.002058272948548872, + "loss": 2.632, + "mean_token_accuracy": 0.4998527765274048, + "num_tokens": 9120261767.0, + "step": 17841 + }, + { + "epoch": 4.824770146024878, + "grad_norm": 3.71875, + "learning_rate": 0.0020580939780891643, + "loss": 2.8618, + "mean_token_accuracy": 0.4485732913017273, + "num_tokens": 9120778079.0, + "step": 17842 + }, + { + "epoch": 4.825040562466198, + "grad_norm": 3.90625, + "learning_rate": 0.0020579152819934295, + "loss": 2.7658, + "mean_token_accuracy": 0.43815040588378906, + "num_tokens": 9121302341.0, + "step": 17843 + }, + { + "epoch": 4.825310978907518, + "grad_norm": 3.953125, + "learning_rate": 0.002057736860267151, + "loss": 2.6254, + "mean_token_accuracy": 0.45013850927352905, + "num_tokens": 9121826546.0, + "step": 17844 + }, + { + "epoch": 4.825581395348837, + "grad_norm": 3.84375, + "learning_rate": 0.0020575587129158025, + "loss": 2.7986, + "mean_token_accuracy": 0.432563841342926, + "num_tokens": 9122350727.0, + "step": 17845 + }, + { + "epoch": 4.825851811790157, + "grad_norm": 4.96875, + "learning_rate": 0.00205738083994485, + "loss": 2.7463, + "mean_token_accuracy": 0.4562138020992279, + "num_tokens": 9122874905.0, + "step": 17846 + }, + { + "epoch": 4.826122228231476, + "grad_norm": 3.390625, + "learning_rate": 0.002057203241359754, + "loss": 2.6404, + "mean_token_accuracy": 0.42496687173843384, + "num_tokens": 9123399167.0, + "step": 17847 + }, + { + "epoch": 4.826392644672796, + "grad_norm": 3.5, + "learning_rate": 0.0020570259171659605, + "loss": 2.7697, + "mean_token_accuracy": 0.4376585781574249, + "num_tokens": 9123897991.0, + "step": 17848 + }, + { + "epoch": 4.8266630611141155, + "grad_norm": 3.59375, + "learning_rate": 0.002056848867368911, + "loss": 2.5855, + "mean_token_accuracy": 0.4632432162761688, + "num_tokens": 9124382301.0, + "step": 17849 + }, + { + "epoch": 4.826933477555436, + "grad_norm": 3.71875, + "learning_rate": 0.0020566720919740385, + "loss": 2.8512, + "mean_token_accuracy": 0.43543630838394165, + "num_tokens": 9124855703.0, + "step": 17850 + }, + { + "epoch": 4.827203893996755, + "grad_norm": 87.5, + "learning_rate": 0.002056495590986767, + "loss": 3.5205, + "mean_token_accuracy": 0.4014880061149597, + "num_tokens": 9125379835.0, + "step": 17851 + }, + { + "epoch": 4.827474310438075, + "grad_norm": 7.5, + "learning_rate": 0.0020563193644125115, + "loss": 2.8354, + "mean_token_accuracy": 0.42589861154556274, + "num_tokens": 9125904117.0, + "step": 17852 + }, + { + "epoch": 4.827744726879394, + "grad_norm": 2.671875, + "learning_rate": 0.0020561434122566805, + "loss": 3.0114, + "mean_token_accuracy": 0.41681450605392456, + "num_tokens": 9126428346.0, + "step": 17853 + }, + { + "epoch": 4.828015143320714, + "grad_norm": 3.953125, + "learning_rate": 0.0020559677345246717, + "loss": 2.4841, + "mean_token_accuracy": 0.45171764492988586, + "num_tokens": 9126952628.0, + "step": 17854 + }, + { + "epoch": 4.828285559762033, + "grad_norm": 2.890625, + "learning_rate": 0.0020557923312218744, + "loss": 2.7519, + "mean_token_accuracy": 0.44587284326553345, + "num_tokens": 9127476866.0, + "step": 17855 + }, + { + "epoch": 4.828555976203353, + "grad_norm": 4.15625, + "learning_rate": 0.0020556172023536733, + "loss": 2.8106, + "mean_token_accuracy": 0.43902599811553955, + "num_tokens": 9128001130.0, + "step": 17856 + }, + { + "epoch": 4.828826392644673, + "grad_norm": 4.21875, + "learning_rate": 0.0020554423479254387, + "loss": 2.6612, + "mean_token_accuracy": 0.4655143618583679, + "num_tokens": 9128464751.0, + "step": 17857 + }, + { + "epoch": 4.829096809085993, + "grad_norm": 4.28125, + "learning_rate": 0.0020552677679425376, + "loss": 2.707, + "mean_token_accuracy": 0.4478174149990082, + "num_tokens": 9128988903.0, + "step": 17858 + }, + { + "epoch": 4.829367225527312, + "grad_norm": 4.09375, + "learning_rate": 0.0020550934624103258, + "loss": 2.8326, + "mean_token_accuracy": 0.43052536249160767, + "num_tokens": 9129513182.0, + "step": 17859 + }, + { + "epoch": 4.829637641968632, + "grad_norm": 4.40625, + "learning_rate": 0.0020549194313341517, + "loss": 2.7536, + "mean_token_accuracy": 0.46041804552078247, + "num_tokens": 9129943824.0, + "step": 17860 + }, + { + "epoch": 4.829908058409951, + "grad_norm": 4.03125, + "learning_rate": 0.0020547456747193557, + "loss": 2.8128, + "mean_token_accuracy": 0.4145355224609375, + "num_tokens": 9130468104.0, + "step": 17861 + }, + { + "epoch": 4.830178474851271, + "grad_norm": 3.734375, + "learning_rate": 0.0020545721925712686, + "loss": 2.5625, + "mean_token_accuracy": 0.4459722340106964, + "num_tokens": 9130992298.0, + "step": 17862 + }, + { + "epoch": 4.8304488912925905, + "grad_norm": 2.875, + "learning_rate": 0.0020543989848952134, + "loss": 2.8603, + "mean_token_accuracy": 0.4387897253036499, + "num_tokens": 9131516516.0, + "step": 17863 + }, + { + "epoch": 4.830719307733911, + "grad_norm": 4.59375, + "learning_rate": 0.0020542260516965044, + "loss": 2.6494, + "mean_token_accuracy": 0.4550749659538269, + "num_tokens": 9132040768.0, + "step": 17864 + }, + { + "epoch": 4.83098972417523, + "grad_norm": 3.3125, + "learning_rate": 0.0020540533929804482, + "loss": 2.7115, + "mean_token_accuracy": 0.44745147228240967, + "num_tokens": 9132564976.0, + "step": 17865 + }, + { + "epoch": 4.83126014061655, + "grad_norm": 3.984375, + "learning_rate": 0.002053881008752342, + "loss": 2.5785, + "mean_token_accuracy": 0.4497683346271515, + "num_tokens": 9133040155.0, + "step": 17866 + }, + { + "epoch": 4.831530557057869, + "grad_norm": 3.40625, + "learning_rate": 0.0020537088990174764, + "loss": 2.5607, + "mean_token_accuracy": 0.4738541841506958, + "num_tokens": 9133553086.0, + "step": 17867 + }, + { + "epoch": 4.831800973499189, + "grad_norm": 4.65625, + "learning_rate": 0.00205353706378113, + "loss": 2.9266, + "mean_token_accuracy": 0.4262191951274872, + "num_tokens": 9134077367.0, + "step": 17868 + }, + { + "epoch": 4.832071389940508, + "grad_norm": 4.15625, + "learning_rate": 0.002053365503048576, + "loss": 2.8401, + "mean_token_accuracy": 0.4324037730693817, + "num_tokens": 9134600658.0, + "step": 17869 + }, + { + "epoch": 4.832341806381828, + "grad_norm": 4.21875, + "learning_rate": 0.002053194216825079, + "loss": 2.6816, + "mean_token_accuracy": 0.4423450827598572, + "num_tokens": 9135122868.0, + "step": 17870 + }, + { + "epoch": 4.832612222823148, + "grad_norm": 7.53125, + "learning_rate": 0.0020530232051158945, + "loss": 2.0146, + "mean_token_accuracy": 0.538915753364563, + "num_tokens": 9135647153.0, + "step": 17871 + }, + { + "epoch": 4.832882639264467, + "grad_norm": 6.21875, + "learning_rate": 0.00205285246792627, + "loss": 2.8786, + "mean_token_accuracy": 0.44351959228515625, + "num_tokens": 9136171413.0, + "step": 17872 + }, + { + "epoch": 4.833153055705787, + "grad_norm": 3.390625, + "learning_rate": 0.0020526820052614424, + "loss": 2.8058, + "mean_token_accuracy": 0.44784829020500183, + "num_tokens": 9136693590.0, + "step": 17873 + }, + { + "epoch": 4.833423472147107, + "grad_norm": 4.0625, + "learning_rate": 0.0020525118171266436, + "loss": 2.793, + "mean_token_accuracy": 0.44551581144332886, + "num_tokens": 9137217866.0, + "step": 17874 + }, + { + "epoch": 4.833693888588426, + "grad_norm": 4.46875, + "learning_rate": 0.002052341903527095, + "loss": 2.8491, + "mean_token_accuracy": 0.4414021968841553, + "num_tokens": 9137742111.0, + "step": 17875 + }, + { + "epoch": 4.833964305029745, + "grad_norm": 4.09375, + "learning_rate": 0.0020521722644680114, + "loss": 2.8501, + "mean_token_accuracy": 0.4393823742866516, + "num_tokens": 9138266220.0, + "step": 17876 + }, + { + "epoch": 4.8342347214710655, + "grad_norm": 4.59375, + "learning_rate": 0.0020520028999545955, + "loss": 2.7406, + "mean_token_accuracy": 0.4505366384983063, + "num_tokens": 9138790498.0, + "step": 17877 + }, + { + "epoch": 4.834505137912386, + "grad_norm": 3.203125, + "learning_rate": 0.0020518338099920447, + "loss": 2.5979, + "mean_token_accuracy": 0.457781583070755, + "num_tokens": 9139308015.0, + "step": 17878 + }, + { + "epoch": 4.834775554353705, + "grad_norm": 3.5, + "learning_rate": 0.002051664994585548, + "loss": 2.6492, + "mean_token_accuracy": 0.44905614852905273, + "num_tokens": 9139832261.0, + "step": 17879 + }, + { + "epoch": 4.835045970795024, + "grad_norm": 4.25, + "learning_rate": 0.002051496453740284, + "loss": 2.9349, + "mean_token_accuracy": 0.4354085326194763, + "num_tokens": 9140319813.0, + "step": 17880 + }, + { + "epoch": 4.835316387236344, + "grad_norm": 3.671875, + "learning_rate": 0.0020513281874614256, + "loss": 2.6788, + "mean_token_accuracy": 0.4516758918762207, + "num_tokens": 9140843952.0, + "step": 17881 + }, + { + "epoch": 4.835586803677663, + "grad_norm": 3.34375, + "learning_rate": 0.0020511601957541345, + "loss": 2.7596, + "mean_token_accuracy": 0.441888689994812, + "num_tokens": 9141368207.0, + "step": 17882 + }, + { + "epoch": 4.835857220118983, + "grad_norm": 4.4375, + "learning_rate": 0.002050992478623565, + "loss": 2.5982, + "mean_token_accuracy": 0.45752793550491333, + "num_tokens": 9141892479.0, + "step": 17883 + }, + { + "epoch": 4.8361276365603025, + "grad_norm": 4.90625, + "learning_rate": 0.0020508250360748636, + "loss": 2.8268, + "mean_token_accuracy": 0.44618910551071167, + "num_tokens": 9142416762.0, + "step": 17884 + }, + { + "epoch": 4.836398053001623, + "grad_norm": 3.921875, + "learning_rate": 0.002050657868113168, + "loss": 2.8559, + "mean_token_accuracy": 0.43835893273353577, + "num_tokens": 9142914022.0, + "step": 17885 + }, + { + "epoch": 4.836668469442942, + "grad_norm": 4.34375, + "learning_rate": 0.002050490974743607, + "loss": 2.8356, + "mean_token_accuracy": 0.43284010887145996, + "num_tokens": 9143438168.0, + "step": 17886 + }, + { + "epoch": 4.836938885884262, + "grad_norm": 3.4375, + "learning_rate": 0.002050324355971303, + "loss": 2.7145, + "mean_token_accuracy": 0.43998783826828003, + "num_tokens": 9143962378.0, + "step": 17887 + }, + { + "epoch": 4.837209302325581, + "grad_norm": 3.078125, + "learning_rate": 0.0020501580118013653, + "loss": 2.7408, + "mean_token_accuracy": 0.447374552488327, + "num_tokens": 9144478990.0, + "step": 17888 + }, + { + "epoch": 4.837479718766901, + "grad_norm": 4.46875, + "learning_rate": 0.0020499919422388995, + "loss": 2.7745, + "mean_token_accuracy": 0.4399198293685913, + "num_tokens": 9145003166.0, + "step": 17889 + }, + { + "epoch": 4.83775013520822, + "grad_norm": 3.9375, + "learning_rate": 0.0020498261472890024, + "loss": 2.6208, + "mean_token_accuracy": 0.4539700448513031, + "num_tokens": 9145527343.0, + "step": 17890 + }, + { + "epoch": 4.8380205516495405, + "grad_norm": 35.75, + "learning_rate": 0.0020496606269567604, + "loss": 2.5108, + "mean_token_accuracy": 0.49918004870414734, + "num_tokens": 9146051613.0, + "step": 17891 + }, + { + "epoch": 4.83829096809086, + "grad_norm": 8.75, + "learning_rate": 0.0020494953812472496, + "loss": 2.6561, + "mean_token_accuracy": 0.4718894064426422, + "num_tokens": 9146515046.0, + "step": 17892 + }, + { + "epoch": 4.83856138453218, + "grad_norm": 3.171875, + "learning_rate": 0.002049330410165544, + "loss": 2.7893, + "mean_token_accuracy": 0.4364704489707947, + "num_tokens": 9147032783.0, + "step": 17893 + }, + { + "epoch": 4.838831800973499, + "grad_norm": 4.15625, + "learning_rate": 0.0020491657137167033, + "loss": 2.891, + "mean_token_accuracy": 0.39461904764175415, + "num_tokens": 9147556929.0, + "step": 17894 + }, + { + "epoch": 4.839102217414819, + "grad_norm": 3.375, + "learning_rate": 0.00204900129190578, + "loss": 2.7274, + "mean_token_accuracy": 0.42873626947402954, + "num_tokens": 9148081144.0, + "step": 17895 + }, + { + "epoch": 4.839372633856138, + "grad_norm": 3.34375, + "learning_rate": 0.002048837144737822, + "loss": 2.8281, + "mean_token_accuracy": 0.4379808306694031, + "num_tokens": 9148605410.0, + "step": 17896 + }, + { + "epoch": 4.839643050297458, + "grad_norm": 3.546875, + "learning_rate": 0.002048673272217862, + "loss": 2.846, + "mean_token_accuracy": 0.44148170948028564, + "num_tokens": 9149129679.0, + "step": 17897 + }, + { + "epoch": 4.8399134667387775, + "grad_norm": 4.4375, + "learning_rate": 0.0020485096743509314, + "loss": 2.7122, + "mean_token_accuracy": 0.4584049582481384, + "num_tokens": 9149653824.0, + "step": 17898 + }, + { + "epoch": 4.840183883180098, + "grad_norm": 4.0625, + "learning_rate": 0.002048346351142048, + "loss": 2.6854, + "mean_token_accuracy": 0.4575437903404236, + "num_tokens": 9150119120.0, + "step": 17899 + }, + { + "epoch": 4.840454299621417, + "grad_norm": 3.828125, + "learning_rate": 0.0020481833025962244, + "loss": 2.834, + "mean_token_accuracy": 0.43227705359458923, + "num_tokens": 9150643361.0, + "step": 17900 + }, + { + "epoch": 4.840724716062737, + "grad_norm": 4.375, + "learning_rate": 0.002048020528718462, + "loss": 2.9111, + "mean_token_accuracy": 0.43343645334243774, + "num_tokens": 9151167603.0, + "step": 17901 + }, + { + "epoch": 4.840995132504056, + "grad_norm": 4.03125, + "learning_rate": 0.0020478580295137565, + "loss": 2.888, + "mean_token_accuracy": 0.4358031153678894, + "num_tokens": 9151682027.0, + "step": 17902 + }, + { + "epoch": 4.841265548945376, + "grad_norm": 3.78125, + "learning_rate": 0.0020476958049870926, + "loss": 2.6731, + "mean_token_accuracy": 0.43732750415802, + "num_tokens": 9152206200.0, + "step": 17903 + }, + { + "epoch": 4.841535965386695, + "grad_norm": 3.890625, + "learning_rate": 0.0020475338551434485, + "loss": 2.6067, + "mean_token_accuracy": 0.45942872762680054, + "num_tokens": 9152730464.0, + "step": 17904 + }, + { + "epoch": 4.8418063818280155, + "grad_norm": 4.4375, + "learning_rate": 0.0020473721799877938, + "loss": 2.7522, + "mean_token_accuracy": 0.44237416982650757, + "num_tokens": 9153254700.0, + "step": 17905 + }, + { + "epoch": 4.842076798269335, + "grad_norm": 3.34375, + "learning_rate": 0.0020472107795250884, + "loss": 2.6626, + "mean_token_accuracy": 0.4788113534450531, + "num_tokens": 9153740497.0, + "step": 17906 + }, + { + "epoch": 4.842347214710655, + "grad_norm": 3.703125, + "learning_rate": 0.0020470496537602838, + "loss": 2.7184, + "mean_token_accuracy": 0.45409268140792847, + "num_tokens": 9154264618.0, + "step": 17907 + }, + { + "epoch": 4.842617631151974, + "grad_norm": 4.90625, + "learning_rate": 0.0020468888026983255, + "loss": 2.7424, + "mean_token_accuracy": 0.46159303188323975, + "num_tokens": 9154788882.0, + "step": 17908 + }, + { + "epoch": 4.842888047593294, + "grad_norm": 3.828125, + "learning_rate": 0.0020467282263441485, + "loss": 2.7591, + "mean_token_accuracy": 0.4362473785877228, + "num_tokens": 9155312978.0, + "step": 17909 + }, + { + "epoch": 4.843158464034613, + "grad_norm": 4.15625, + "learning_rate": 0.0020465679247026786, + "loss": 2.7847, + "mean_token_accuracy": 0.44616079330444336, + "num_tokens": 9155808115.0, + "step": 17910 + }, + { + "epoch": 4.843428880475933, + "grad_norm": 19.25, + "learning_rate": 0.0020464078977788353, + "loss": 2.351, + "mean_token_accuracy": 0.4568437933921814, + "num_tokens": 9156332388.0, + "step": 17911 + }, + { + "epoch": 4.8436992969172525, + "grad_norm": 7.03125, + "learning_rate": 0.002046248145577529, + "loss": 2.6941, + "mean_token_accuracy": 0.4323206841945648, + "num_tokens": 9156856667.0, + "step": 17912 + }, + { + "epoch": 4.843969713358572, + "grad_norm": 2.625, + "learning_rate": 0.00204608866810366, + "loss": 2.6409, + "mean_token_accuracy": 0.45252686738967896, + "num_tokens": 9157380912.0, + "step": 17913 + }, + { + "epoch": 4.844240129799892, + "grad_norm": 3.09375, + "learning_rate": 0.002045929465362123, + "loss": 2.8642, + "mean_token_accuracy": 0.4526096284389496, + "num_tokens": 9157846234.0, + "step": 17914 + }, + { + "epoch": 4.844510546241212, + "grad_norm": 3.625, + "learning_rate": 0.0020457705373578025, + "loss": 2.7559, + "mean_token_accuracy": 0.45174160599708557, + "num_tokens": 9158370447.0, + "step": 17915 + }, + { + "epoch": 4.844780962682531, + "grad_norm": 4.28125, + "learning_rate": 0.002045611884095574, + "loss": 2.7513, + "mean_token_accuracy": 0.4483710527420044, + "num_tokens": 9158894626.0, + "step": 17916 + }, + { + "epoch": 4.84505137912385, + "grad_norm": 3.78125, + "learning_rate": 0.002045453505580306, + "loss": 2.871, + "mean_token_accuracy": 0.43766021728515625, + "num_tokens": 9159383023.0, + "step": 17917 + }, + { + "epoch": 4.84532179556517, + "grad_norm": 3.40625, + "learning_rate": 0.002045295401816858, + "loss": 2.5127, + "mean_token_accuracy": 0.4631640911102295, + "num_tokens": 9159875821.0, + "step": 17918 + }, + { + "epoch": 4.8455922120064905, + "grad_norm": 3.265625, + "learning_rate": 0.002045137572810081, + "loss": 2.6248, + "mean_token_accuracy": 0.46659615635871887, + "num_tokens": 9160399987.0, + "step": 17919 + }, + { + "epoch": 4.84586262844781, + "grad_norm": 30.875, + "learning_rate": 0.0020449800185648186, + "loss": 2.8865, + "mean_token_accuracy": 0.4362449645996094, + "num_tokens": 9160924261.0, + "step": 17920 + }, + { + "epoch": 4.846133044889129, + "grad_norm": 6.84375, + "learning_rate": 0.0020448227390859044, + "loss": 2.9925, + "mean_token_accuracy": 0.42079806327819824, + "num_tokens": 9161430035.0, + "step": 17921 + }, + { + "epoch": 4.846403461330449, + "grad_norm": 3.8125, + "learning_rate": 0.002044665734378163, + "loss": 2.9334, + "mean_token_accuracy": 0.4469502568244934, + "num_tokens": 9161929558.0, + "step": 17922 + }, + { + "epoch": 4.846673877771768, + "grad_norm": 4.75, + "learning_rate": 0.0020445090044464136, + "loss": 2.7534, + "mean_token_accuracy": 0.43546149134635925, + "num_tokens": 9162403934.0, + "step": 17923 + }, + { + "epoch": 4.846944294213088, + "grad_norm": 3.671875, + "learning_rate": 0.002044352549295464, + "loss": 2.6554, + "mean_token_accuracy": 0.4623051583766937, + "num_tokens": 9162928202.0, + "step": 17924 + }, + { + "epoch": 4.8472147106544075, + "grad_norm": 3.609375, + "learning_rate": 0.002044196368930114, + "loss": 2.6739, + "mean_token_accuracy": 0.43451234698295593, + "num_tokens": 9163427573.0, + "step": 17925 + }, + { + "epoch": 4.8474851270957275, + "grad_norm": 4.03125, + "learning_rate": 0.002044040463355158, + "loss": 2.712, + "mean_token_accuracy": 0.45828962326049805, + "num_tokens": 9163888691.0, + "step": 17926 + }, + { + "epoch": 4.847755543537047, + "grad_norm": 4.03125, + "learning_rate": 0.0020438848325753786, + "loss": 2.8256, + "mean_token_accuracy": 0.4374670684337616, + "num_tokens": 9164412929.0, + "step": 17927 + }, + { + "epoch": 4.848025959978367, + "grad_norm": 4.28125, + "learning_rate": 0.00204372947659555, + "loss": 2.7543, + "mean_token_accuracy": 0.43922704458236694, + "num_tokens": 9164937211.0, + "step": 17928 + }, + { + "epoch": 4.848296376419686, + "grad_norm": 4.15625, + "learning_rate": 0.0020435743954204393, + "loss": 2.8548, + "mean_token_accuracy": 0.44101572036743164, + "num_tokens": 9165461376.0, + "step": 17929 + }, + { + "epoch": 4.848566792861006, + "grad_norm": 3.8125, + "learning_rate": 0.002043419589054807, + "loss": 2.7401, + "mean_token_accuracy": 0.45294344425201416, + "num_tokens": 9165985547.0, + "step": 17930 + }, + { + "epoch": 4.848837209302325, + "grad_norm": 14.125, + "learning_rate": 0.0020432650575034, + "loss": 2.226, + "mean_token_accuracy": 0.4930357038974762, + "num_tokens": 9166504140.0, + "step": 17931 + }, + { + "epoch": 4.849107625743645, + "grad_norm": 8.0625, + "learning_rate": 0.0020431108007709603, + "loss": 2.8989, + "mean_token_accuracy": 0.4338020086288452, + "num_tokens": 9167028423.0, + "step": 17932 + }, + { + "epoch": 4.849378042184965, + "grad_norm": 2.953125, + "learning_rate": 0.0020429568188622227, + "loss": 2.8169, + "mean_token_accuracy": 0.426761269569397, + "num_tokens": 9167552545.0, + "step": 17933 + }, + { + "epoch": 4.849648458626285, + "grad_norm": 4.5625, + "learning_rate": 0.002042803111781911, + "loss": 2.7304, + "mean_token_accuracy": 0.45726507902145386, + "num_tokens": 9168034078.0, + "step": 17934 + }, + { + "epoch": 4.849918875067604, + "grad_norm": 4.65625, + "learning_rate": 0.0020426496795347415, + "loss": 2.5624, + "mean_token_accuracy": 0.4595932364463806, + "num_tokens": 9168558359.0, + "step": 17935 + }, + { + "epoch": 4.850189291508924, + "grad_norm": 3.953125, + "learning_rate": 0.0020424965221254214, + "loss": 2.7065, + "mean_token_accuracy": 0.43917974829673767, + "num_tokens": 9169082614.0, + "step": 17936 + }, + { + "epoch": 4.850459707950243, + "grad_norm": 4.28125, + "learning_rate": 0.0020423436395586502, + "loss": 2.7416, + "mean_token_accuracy": 0.44391217827796936, + "num_tokens": 9169597293.0, + "step": 17937 + }, + { + "epoch": 4.850730124391563, + "grad_norm": 4.46875, + "learning_rate": 0.002042191031839119, + "loss": 2.7983, + "mean_token_accuracy": 0.4192750155925751, + "num_tokens": 9170121478.0, + "step": 17938 + }, + { + "epoch": 4.8510005408328825, + "grad_norm": 3.515625, + "learning_rate": 0.00204203869897151, + "loss": 2.8048, + "mean_token_accuracy": 0.44743794202804565, + "num_tokens": 9170645757.0, + "step": 17939 + }, + { + "epoch": 4.8512709572742025, + "grad_norm": 4.78125, + "learning_rate": 0.002041886640960497, + "loss": 2.5887, + "mean_token_accuracy": 0.4618278741836548, + "num_tokens": 9171110941.0, + "step": 17940 + }, + { + "epoch": 4.851541373715522, + "grad_norm": 3.5625, + "learning_rate": 0.0020417348578107464, + "loss": 2.7577, + "mean_token_accuracy": 0.4472373425960541, + "num_tokens": 9171588874.0, + "step": 17941 + }, + { + "epoch": 4.851811790156842, + "grad_norm": 4.34375, + "learning_rate": 0.0020415833495269144, + "loss": 2.7197, + "mean_token_accuracy": 0.4623093605041504, + "num_tokens": 9172069434.0, + "step": 17942 + }, + { + "epoch": 4.852082206598161, + "grad_norm": 3.875, + "learning_rate": 0.0020414321161136514, + "loss": 2.8987, + "mean_token_accuracy": 0.42685192823410034, + "num_tokens": 9172593701.0, + "step": 17943 + }, + { + "epoch": 4.852352623039481, + "grad_norm": 4.53125, + "learning_rate": 0.002041281157575595, + "loss": 2.8409, + "mean_token_accuracy": 0.43776625394821167, + "num_tokens": 9173117944.0, + "step": 17944 + }, + { + "epoch": 4.8526230394808, + "grad_norm": 3.84375, + "learning_rate": 0.0020411304739173792, + "loss": 2.9272, + "mean_token_accuracy": 0.4206479489803314, + "num_tokens": 9173642198.0, + "step": 17945 + }, + { + "epoch": 4.85289345592212, + "grad_norm": 3.421875, + "learning_rate": 0.002040980065143627, + "loss": 2.9203, + "mean_token_accuracy": 0.4264167845249176, + "num_tokens": 9174134337.0, + "step": 17946 + }, + { + "epoch": 4.85316387236344, + "grad_norm": 3.453125, + "learning_rate": 0.002040829931258953, + "loss": 2.772, + "mean_token_accuracy": 0.41607195138931274, + "num_tokens": 9174658502.0, + "step": 17947 + }, + { + "epoch": 4.85343428880476, + "grad_norm": 3.390625, + "learning_rate": 0.0020406800722679646, + "loss": 2.8559, + "mean_token_accuracy": 0.44184327125549316, + "num_tokens": 9175154414.0, + "step": 17948 + }, + { + "epoch": 4.853704705246079, + "grad_norm": 4.21875, + "learning_rate": 0.002040530488175258, + "loss": 2.8342, + "mean_token_accuracy": 0.44208472967147827, + "num_tokens": 9175678602.0, + "step": 17949 + }, + { + "epoch": 4.853975121687399, + "grad_norm": 3.984375, + "learning_rate": 0.002040381178985425, + "loss": 2.7265, + "mean_token_accuracy": 0.42422911524772644, + "num_tokens": 9176202704.0, + "step": 17950 + }, + { + "epoch": 4.854245538128718, + "grad_norm": 17.375, + "learning_rate": 0.0020402321447030463, + "loss": 2.3441, + "mean_token_accuracy": 0.4951343536376953, + "num_tokens": 9176692863.0, + "step": 17951 + }, + { + "epoch": 4.854515954570038, + "grad_norm": 7.28125, + "learning_rate": 0.002040083385332693, + "loss": 2.8162, + "mean_token_accuracy": 0.44757750630378723, + "num_tokens": 9177217036.0, + "step": 17952 + }, + { + "epoch": 4.8547863710113575, + "grad_norm": 3.140625, + "learning_rate": 0.0020399349008789315, + "loss": 2.6949, + "mean_token_accuracy": 0.44771748781204224, + "num_tokens": 9177741229.0, + "step": 17953 + }, + { + "epoch": 4.855056787452677, + "grad_norm": 3.984375, + "learning_rate": 0.002039786691346317, + "loss": 3.007, + "mean_token_accuracy": 0.42858994007110596, + "num_tokens": 9178245258.0, + "step": 17954 + }, + { + "epoch": 4.855327203893997, + "grad_norm": 4.4375, + "learning_rate": 0.0020396387567393976, + "loss": 2.8703, + "mean_token_accuracy": 0.4215629994869232, + "num_tokens": 9178769544.0, + "step": 17955 + }, + { + "epoch": 4.855597620335317, + "grad_norm": 5.78125, + "learning_rate": 0.0020394910970627108, + "loss": 2.4852, + "mean_token_accuracy": 0.4676436483860016, + "num_tokens": 9179293756.0, + "step": 17956 + }, + { + "epoch": 4.855868036776636, + "grad_norm": 4.375, + "learning_rate": 0.00203934371232079, + "loss": 2.6562, + "mean_token_accuracy": 0.44935888051986694, + "num_tokens": 9179817957.0, + "step": 17957 + }, + { + "epoch": 4.856138453217955, + "grad_norm": 4.3125, + "learning_rate": 0.0020391966025181544, + "loss": 2.7079, + "mean_token_accuracy": 0.45530638098716736, + "num_tokens": 9180307647.0, + "step": 17958 + }, + { + "epoch": 4.856408869659275, + "grad_norm": 5.34375, + "learning_rate": 0.0020390497676593183, + "loss": 2.786, + "mean_token_accuracy": 0.4405704736709595, + "num_tokens": 9180831825.0, + "step": 17959 + }, + { + "epoch": 4.856679286100595, + "grad_norm": 4.34375, + "learning_rate": 0.0020389032077487886, + "loss": 2.9448, + "mean_token_accuracy": 0.43589168787002563, + "num_tokens": 9181280367.0, + "step": 17960 + }, + { + "epoch": 4.856949702541915, + "grad_norm": 4.15625, + "learning_rate": 0.0020387569227910615, + "loss": 2.7139, + "mean_token_accuracy": 0.4340475797653198, + "num_tokens": 9181777051.0, + "step": 17961 + }, + { + "epoch": 4.857220118983234, + "grad_norm": 3.609375, + "learning_rate": 0.0020386109127906254, + "loss": 2.7999, + "mean_token_accuracy": 0.4422278106212616, + "num_tokens": 9182301240.0, + "step": 17962 + }, + { + "epoch": 4.857490535424554, + "grad_norm": 3.453125, + "learning_rate": 0.0020384651777519604, + "loss": 2.6753, + "mean_token_accuracy": 0.44749265909194946, + "num_tokens": 9182825515.0, + "step": 17963 + }, + { + "epoch": 4.857760951865873, + "grad_norm": 3.8125, + "learning_rate": 0.002038319717679537, + "loss": 2.8137, + "mean_token_accuracy": 0.43317073583602905, + "num_tokens": 9183329279.0, + "step": 17964 + }, + { + "epoch": 4.858031368307193, + "grad_norm": 3.78125, + "learning_rate": 0.0020381745325778206, + "loss": 2.9572, + "mean_token_accuracy": 0.46037817001342773, + "num_tokens": 9183788027.0, + "step": 17965 + }, + { + "epoch": 4.858301784748512, + "grad_norm": 4.125, + "learning_rate": 0.0020380296224512644, + "loss": 2.6467, + "mean_token_accuracy": 0.44121965765953064, + "num_tokens": 9184284843.0, + "step": 17966 + }, + { + "epoch": 4.8585722011898325, + "grad_norm": 3.734375, + "learning_rate": 0.002037884987304313, + "loss": 2.8588, + "mean_token_accuracy": 0.4265061914920807, + "num_tokens": 9184800568.0, + "step": 17967 + }, + { + "epoch": 4.858842617631152, + "grad_norm": 3.125, + "learning_rate": 0.002037740627141407, + "loss": 2.8426, + "mean_token_accuracy": 0.42842039465904236, + "num_tokens": 9185324850.0, + "step": 17968 + }, + { + "epoch": 4.859113034072472, + "grad_norm": 3.53125, + "learning_rate": 0.0020375965419669757, + "loss": 2.6956, + "mean_token_accuracy": 0.45347654819488525, + "num_tokens": 9185849025.0, + "step": 17969 + }, + { + "epoch": 4.859383450513791, + "grad_norm": 3.640625, + "learning_rate": 0.0020374527317854386, + "loss": 2.5029, + "mean_token_accuracy": 0.45340073108673096, + "num_tokens": 9186373202.0, + "step": 17970 + }, + { + "epoch": 4.859653866955111, + "grad_norm": 11.875, + "learning_rate": 0.002037309196601209, + "loss": 2.4857, + "mean_token_accuracy": 0.488952100276947, + "num_tokens": 9186888690.0, + "step": 17971 + }, + { + "epoch": 4.85992428339643, + "grad_norm": 6.1875, + "learning_rate": 0.0020371659364186905, + "loss": 2.9068, + "mean_token_accuracy": 0.43897759914398193, + "num_tokens": 9187412970.0, + "step": 17972 + }, + { + "epoch": 4.86019469983775, + "grad_norm": 3.90625, + "learning_rate": 0.0020370229512422793, + "loss": 2.7954, + "mean_token_accuracy": 0.4384539723396301, + "num_tokens": 9187931929.0, + "step": 17973 + }, + { + "epoch": 4.8604651162790695, + "grad_norm": 3.640625, + "learning_rate": 0.0020368802410763616, + "loss": 2.8099, + "mean_token_accuracy": 0.43114137649536133, + "num_tokens": 9188456066.0, + "step": 17974 + }, + { + "epoch": 4.86073553272039, + "grad_norm": 3.765625, + "learning_rate": 0.002036737805925318, + "loss": 2.7898, + "mean_token_accuracy": 0.48236626386642456, + "num_tokens": 9188915413.0, + "step": 17975 + }, + { + "epoch": 4.861005949161709, + "grad_norm": 3.4375, + "learning_rate": 0.0020365956457935176, + "loss": 2.8644, + "mean_token_accuracy": 0.4379361569881439, + "num_tokens": 9189439671.0, + "step": 17976 + }, + { + "epoch": 4.861276365603029, + "grad_norm": 4.0625, + "learning_rate": 0.002036453760685322, + "loss": 2.7956, + "mean_token_accuracy": 0.44628793001174927, + "num_tokens": 9189963930.0, + "step": 17977 + }, + { + "epoch": 4.861546782044348, + "grad_norm": 68.0, + "learning_rate": 0.002036312150605085, + "loss": 2.7612, + "mean_token_accuracy": 0.44140034914016724, + "num_tokens": 9190488084.0, + "step": 17978 + }, + { + "epoch": 4.861817198485668, + "grad_norm": 17.75, + "learning_rate": 0.002036170815557152, + "loss": 2.6345, + "mean_token_accuracy": 0.4675309360027313, + "num_tokens": 9191012364.0, + "step": 17979 + }, + { + "epoch": 4.862087614926987, + "grad_norm": 4.78125, + "learning_rate": 0.0020360297555458593, + "loss": 2.8429, + "mean_token_accuracy": 0.42610853910446167, + "num_tokens": 9191536532.0, + "step": 17980 + }, + { + "epoch": 4.8623580313683075, + "grad_norm": 3.6875, + "learning_rate": 0.002035888970575535, + "loss": 2.6369, + "mean_token_accuracy": 0.4615510106086731, + "num_tokens": 9192060797.0, + "step": 17981 + }, + { + "epoch": 4.862628447809627, + "grad_norm": 3.140625, + "learning_rate": 0.0020357484606504986, + "loss": 2.7953, + "mean_token_accuracy": 0.45162755250930786, + "num_tokens": 9192523417.0, + "step": 17982 + }, + { + "epoch": 4.862898864250947, + "grad_norm": 3.71875, + "learning_rate": 0.0020356082257750627, + "loss": 2.6707, + "mean_token_accuracy": 0.4356369376182556, + "num_tokens": 9193047647.0, + "step": 17983 + }, + { + "epoch": 4.863169280692266, + "grad_norm": 19.875, + "learning_rate": 0.0020354682659535273, + "loss": 2.5312, + "mean_token_accuracy": 0.4790313243865967, + "num_tokens": 9193571919.0, + "step": 17984 + }, + { + "epoch": 4.863439697133586, + "grad_norm": 6.375, + "learning_rate": 0.00203532858119019, + "loss": 2.7142, + "mean_token_accuracy": 0.4549499750137329, + "num_tokens": 9194096184.0, + "step": 17985 + }, + { + "epoch": 4.863710113574905, + "grad_norm": 3.453125, + "learning_rate": 0.0020351891714893338, + "loss": 2.5536, + "mean_token_accuracy": 0.4580245018005371, + "num_tokens": 9194620446.0, + "step": 17986 + }, + { + "epoch": 4.863980530016225, + "grad_norm": 3.4375, + "learning_rate": 0.002035050036855239, + "loss": 2.7758, + "mean_token_accuracy": 0.448375403881073, + "num_tokens": 9195144624.0, + "step": 17987 + }, + { + "epoch": 4.8642509464575445, + "grad_norm": 3.3125, + "learning_rate": 0.0020349111772921718, + "loss": 2.8066, + "mean_token_accuracy": 0.45761895179748535, + "num_tokens": 9195668794.0, + "step": 17988 + }, + { + "epoch": 4.864521362898865, + "grad_norm": 3.546875, + "learning_rate": 0.002034772592804395, + "loss": 2.7885, + "mean_token_accuracy": 0.4396663010120392, + "num_tokens": 9196193015.0, + "step": 17989 + }, + { + "epoch": 4.864791779340184, + "grad_norm": 3.609375, + "learning_rate": 0.0020346342833961603, + "loss": 2.8346, + "mean_token_accuracy": 0.42392420768737793, + "num_tokens": 9196717226.0, + "step": 17990 + }, + { + "epoch": 4.865062195781504, + "grad_norm": 16.75, + "learning_rate": 0.0020344962490717105, + "loss": 2.0134, + "mean_token_accuracy": 0.5148211717605591, + "num_tokens": 9197228777.0, + "step": 17991 + }, + { + "epoch": 4.865332612222823, + "grad_norm": 6.84375, + "learning_rate": 0.002034358489835282, + "loss": 2.8122, + "mean_token_accuracy": 0.44138726592063904, + "num_tokens": 9197752930.0, + "step": 17992 + }, + { + "epoch": 4.865603028664143, + "grad_norm": 4.71875, + "learning_rate": 0.0020342210056911022, + "loss": 2.5356, + "mean_token_accuracy": 0.5084563493728638, + "num_tokens": 9198242495.0, + "step": 17993 + }, + { + "epoch": 4.865873445105462, + "grad_norm": 4.5, + "learning_rate": 0.0020340837966433874, + "loss": 2.8382, + "mean_token_accuracy": 0.4264678359031677, + "num_tokens": 9198717613.0, + "step": 17994 + }, + { + "epoch": 4.866143861546782, + "grad_norm": 4.84375, + "learning_rate": 0.002033946862696349, + "loss": 2.6549, + "mean_token_accuracy": 0.4437468945980072, + "num_tokens": 9199212547.0, + "step": 17995 + }, + { + "epoch": 4.866414277988102, + "grad_norm": 4.03125, + "learning_rate": 0.0020338102038541883, + "loss": 2.8545, + "mean_token_accuracy": 0.43779823184013367, + "num_tokens": 9199736694.0, + "step": 17996 + }, + { + "epoch": 4.866684694429422, + "grad_norm": 4.5625, + "learning_rate": 0.0020336738201210984, + "loss": 2.5759, + "mean_token_accuracy": 0.43962323665618896, + "num_tokens": 9200260931.0, + "step": 17997 + }, + { + "epoch": 4.866955110870741, + "grad_norm": 4.84375, + "learning_rate": 0.0020335377115012648, + "loss": 2.6543, + "mean_token_accuracy": 0.43208515644073486, + "num_tokens": 9200785120.0, + "step": 17998 + }, + { + "epoch": 4.86722552731206, + "grad_norm": 4.09375, + "learning_rate": 0.0020334018779988618, + "loss": 2.7835, + "mean_token_accuracy": 0.4244810938835144, + "num_tokens": 9201309296.0, + "step": 17999 + }, + { + "epoch": 4.86749594375338, + "grad_norm": 4.03125, + "learning_rate": 0.0020332663196180583, + "loss": 2.9036, + "mean_token_accuracy": 0.43671754002571106, + "num_tokens": 9201833471.0, + "step": 18000 + }, + { + "epoch": 4.8677663601947, + "grad_norm": 4.25, + "learning_rate": 0.002033131036363014, + "loss": 2.802, + "mean_token_accuracy": 0.4482150673866272, + "num_tokens": 9202357669.0, + "step": 18001 + }, + { + "epoch": 4.8680367766360195, + "grad_norm": 3.90625, + "learning_rate": 0.0020329960282378786, + "loss": 2.7977, + "mean_token_accuracy": 0.44140031933784485, + "num_tokens": 9202881898.0, + "step": 18002 + }, + { + "epoch": 4.868307193077339, + "grad_norm": 4.1875, + "learning_rate": 0.0020328612952467966, + "loss": 2.9064, + "mean_token_accuracy": 0.43503713607788086, + "num_tokens": 9203406121.0, + "step": 18003 + }, + { + "epoch": 4.868577609518659, + "grad_norm": 4.34375, + "learning_rate": 0.0020327268373938985, + "loss": 2.9052, + "mean_token_accuracy": 0.42312270402908325, + "num_tokens": 9203930402.0, + "step": 18004 + }, + { + "epoch": 4.868848025959978, + "grad_norm": 3.96875, + "learning_rate": 0.002032592654683314, + "loss": 2.9377, + "mean_token_accuracy": 0.43149733543395996, + "num_tokens": 9204454686.0, + "step": 18005 + }, + { + "epoch": 4.869118442401298, + "grad_norm": 3.875, + "learning_rate": 0.002032458747119158, + "loss": 2.7799, + "mean_token_accuracy": 0.4394579827785492, + "num_tokens": 9204978851.0, + "step": 18006 + }, + { + "epoch": 4.869388858842617, + "grad_norm": 3.890625, + "learning_rate": 0.002032325114705539, + "loss": 2.7761, + "mean_token_accuracy": 0.44595953822135925, + "num_tokens": 9205503115.0, + "step": 18007 + }, + { + "epoch": 4.869659275283937, + "grad_norm": 4.65625, + "learning_rate": 0.0020321917574465586, + "loss": 2.8561, + "mean_token_accuracy": 0.45396488904953003, + "num_tokens": 9206019891.0, + "step": 18008 + }, + { + "epoch": 4.869929691725257, + "grad_norm": 4.28125, + "learning_rate": 0.0020320586753463064, + "loss": 2.832, + "mean_token_accuracy": 0.4235568046569824, + "num_tokens": 9206543940.0, + "step": 18009 + }, + { + "epoch": 4.870200108166577, + "grad_norm": 3.53125, + "learning_rate": 0.002031925868408868, + "loss": 2.6271, + "mean_token_accuracy": 0.5106820464134216, + "num_tokens": 9206990690.0, + "step": 18010 + }, + { + "epoch": 4.870470524607896, + "grad_norm": 11.875, + "learning_rate": 0.0020317933366383175, + "loss": 2.2136, + "mean_token_accuracy": 0.5289627313613892, + "num_tokens": 9207514914.0, + "step": 18011 + }, + { + "epoch": 4.870740941049216, + "grad_norm": 7.96875, + "learning_rate": 0.00203166108003872, + "loss": 2.8179, + "mean_token_accuracy": 0.4374893605709076, + "num_tokens": 9208039054.0, + "step": 18012 + }, + { + "epoch": 4.871011357490535, + "grad_norm": 2.640625, + "learning_rate": 0.002031529098614136, + "loss": 2.8911, + "mean_token_accuracy": 0.4222189486026764, + "num_tokens": 9208563286.0, + "step": 18013 + }, + { + "epoch": 4.871281773931855, + "grad_norm": 4.0, + "learning_rate": 0.002031397392368613, + "loss": 2.7091, + "mean_token_accuracy": 0.4400748908519745, + "num_tokens": 9209087522.0, + "step": 18014 + }, + { + "epoch": 4.871552190373174, + "grad_norm": 3.390625, + "learning_rate": 0.002031265961306193, + "loss": 2.7464, + "mean_token_accuracy": 0.4268043041229248, + "num_tokens": 9209611747.0, + "step": 18015 + }, + { + "epoch": 4.8718226068144945, + "grad_norm": 4.21875, + "learning_rate": 0.0020311348054309086, + "loss": 2.6819, + "mean_token_accuracy": 0.4522811472415924, + "num_tokens": 9210135997.0, + "step": 18016 + }, + { + "epoch": 4.872093023255814, + "grad_norm": 3.609375, + "learning_rate": 0.002031003924746784, + "loss": 2.6906, + "mean_token_accuracy": 0.430014431476593, + "num_tokens": 9210622338.0, + "step": 18017 + }, + { + "epoch": 4.872363439697134, + "grad_norm": 3.359375, + "learning_rate": 0.0020308733192578353, + "loss": 2.7925, + "mean_token_accuracy": 0.45034855604171753, + "num_tokens": 9211146495.0, + "step": 18018 + }, + { + "epoch": 4.872633856138453, + "grad_norm": 3.796875, + "learning_rate": 0.0020307429889680696, + "loss": 2.6826, + "mean_token_accuracy": 0.4714510440826416, + "num_tokens": 9211608405.0, + "step": 18019 + }, + { + "epoch": 4.872904272579773, + "grad_norm": 3.65625, + "learning_rate": 0.0020306129338814867, + "loss": 2.8452, + "mean_token_accuracy": 0.4494164288043976, + "num_tokens": 9212077054.0, + "step": 18020 + }, + { + "epoch": 4.873174689021092, + "grad_norm": 3.9375, + "learning_rate": 0.0020304831540020754, + "loss": 2.7671, + "mean_token_accuracy": 0.4458118677139282, + "num_tokens": 9212595726.0, + "step": 18021 + }, + { + "epoch": 4.873445105462412, + "grad_norm": 7.625, + "learning_rate": 0.002030353649333818, + "loss": 2.5176, + "mean_token_accuracy": 0.513849139213562, + "num_tokens": 9213119844.0, + "step": 18022 + }, + { + "epoch": 4.873715521903732, + "grad_norm": 4.65625, + "learning_rate": 0.0020302244198806894, + "loss": 2.7128, + "mean_token_accuracy": 0.43127578496932983, + "num_tokens": 9213589476.0, + "step": 18023 + }, + { + "epoch": 4.873985938345052, + "grad_norm": 5.1875, + "learning_rate": 0.002030095465646655, + "loss": 2.7482, + "mean_token_accuracy": 0.4676687717437744, + "num_tokens": 9214113696.0, + "step": 18024 + }, + { + "epoch": 4.874256354786371, + "grad_norm": 3.484375, + "learning_rate": 0.002029966786635669, + "loss": 2.7999, + "mean_token_accuracy": 0.419072687625885, + "num_tokens": 9214637958.0, + "step": 18025 + }, + { + "epoch": 4.874526771227691, + "grad_norm": 3.53125, + "learning_rate": 0.0020298383828516813, + "loss": 2.6586, + "mean_token_accuracy": 0.45790237188339233, + "num_tokens": 9215162237.0, + "step": 18026 + }, + { + "epoch": 4.87479718766901, + "grad_norm": 4.1875, + "learning_rate": 0.002029710254298632, + "loss": 2.7674, + "mean_token_accuracy": 0.4338538348674774, + "num_tokens": 9215686434.0, + "step": 18027 + }, + { + "epoch": 4.87506760411033, + "grad_norm": 3.796875, + "learning_rate": 0.0020295824009804514, + "loss": 2.6571, + "mean_token_accuracy": 0.45087915658950806, + "num_tokens": 9216194526.0, + "step": 18028 + }, + { + "epoch": 4.875338020551649, + "grad_norm": 3.640625, + "learning_rate": 0.0020294548229010637, + "loss": 2.9363, + "mean_token_accuracy": 0.4288319945335388, + "num_tokens": 9216718794.0, + "step": 18029 + }, + { + "epoch": 4.8756084369929695, + "grad_norm": 3.890625, + "learning_rate": 0.0020293275200643827, + "loss": 2.7305, + "mean_token_accuracy": 0.4364883601665497, + "num_tokens": 9217242967.0, + "step": 18030 + }, + { + "epoch": 4.875878853434289, + "grad_norm": 33.75, + "learning_rate": 0.002029200492474314, + "loss": 2.5678, + "mean_token_accuracy": 0.4937555193901062, + "num_tokens": 9217767145.0, + "step": 18031 + }, + { + "epoch": 4.876149269875609, + "grad_norm": 6.25, + "learning_rate": 0.0020290737401347557, + "loss": 2.8418, + "mean_token_accuracy": 0.4412957727909088, + "num_tokens": 9218256498.0, + "step": 18032 + }, + { + "epoch": 4.876419686316928, + "grad_norm": 2.9375, + "learning_rate": 0.0020289472630495966, + "loss": 2.5869, + "mean_token_accuracy": 0.46048736572265625, + "num_tokens": 9218775295.0, + "step": 18033 + }, + { + "epoch": 4.876690102758248, + "grad_norm": 3.796875, + "learning_rate": 0.002028821061222718, + "loss": 2.7363, + "mean_token_accuracy": 0.4526888132095337, + "num_tokens": 9219243009.0, + "step": 18034 + }, + { + "epoch": 4.876960519199567, + "grad_norm": 3.359375, + "learning_rate": 0.0020286951346579923, + "loss": 2.8004, + "mean_token_accuracy": 0.42321521043777466, + "num_tokens": 9219767279.0, + "step": 18035 + }, + { + "epoch": 4.8772309356408865, + "grad_norm": 4.09375, + "learning_rate": 0.0020285694833592815, + "loss": 2.6735, + "mean_token_accuracy": 0.47347187995910645, + "num_tokens": 9220248743.0, + "step": 18036 + }, + { + "epoch": 4.877501352082207, + "grad_norm": 4.40625, + "learning_rate": 0.0020284441073304426, + "loss": 2.7153, + "mean_token_accuracy": 0.4649762511253357, + "num_tokens": 9220753284.0, + "step": 18037 + }, + { + "epoch": 4.877771768523527, + "grad_norm": 3.4375, + "learning_rate": 0.0020283190065753225, + "loss": 2.7537, + "mean_token_accuracy": 0.44759130477905273, + "num_tokens": 9221276398.0, + "step": 18038 + }, + { + "epoch": 4.878042184964846, + "grad_norm": 3.875, + "learning_rate": 0.002028194181097759, + "loss": 2.882, + "mean_token_accuracy": 0.44246503710746765, + "num_tokens": 9221736009.0, + "step": 18039 + }, + { + "epoch": 4.878312601406165, + "grad_norm": 3.921875, + "learning_rate": 0.0020280696309015822, + "loss": 2.7536, + "mean_token_accuracy": 0.4465075731277466, + "num_tokens": 9222260218.0, + "step": 18040 + }, + { + "epoch": 4.878583017847485, + "grad_norm": 3.671875, + "learning_rate": 0.0020279453559906143, + "loss": 2.7592, + "mean_token_accuracy": 0.4595399498939514, + "num_tokens": 9222784383.0, + "step": 18041 + }, + { + "epoch": 4.878853434288805, + "grad_norm": 4.03125, + "learning_rate": 0.002027821356368668, + "loss": 2.8472, + "mean_token_accuracy": 0.42207005620002747, + "num_tokens": 9223308624.0, + "step": 18042 + }, + { + "epoch": 4.8791238507301244, + "grad_norm": 3.8125, + "learning_rate": 0.002027697632039548, + "loss": 2.8532, + "mean_token_accuracy": 0.4452570378780365, + "num_tokens": 9223832854.0, + "step": 18043 + }, + { + "epoch": 4.879394267171444, + "grad_norm": 4.1875, + "learning_rate": 0.0020275741830070495, + "loss": 2.6263, + "mean_token_accuracy": 0.4547255039215088, + "num_tokens": 9224357125.0, + "step": 18044 + }, + { + "epoch": 4.879664683612764, + "grad_norm": 43.25, + "learning_rate": 0.002027451009274962, + "loss": 2.6079, + "mean_token_accuracy": 0.4865809679031372, + "num_tokens": 9224814503.0, + "step": 18045 + }, + { + "epoch": 4.879935100054083, + "grad_norm": 4.875, + "learning_rate": 0.0020273281108470643, + "loss": 2.8944, + "mean_token_accuracy": 0.4459843635559082, + "num_tokens": 9225315477.0, + "step": 18046 + }, + { + "epoch": 4.880205516495403, + "grad_norm": 3.75, + "learning_rate": 0.0020272054877271263, + "loss": 2.7893, + "mean_token_accuracy": 0.44800522923469543, + "num_tokens": 9225838958.0, + "step": 18047 + }, + { + "epoch": 4.880475932936722, + "grad_norm": 4.53125, + "learning_rate": 0.0020270831399189114, + "loss": 2.7673, + "mean_token_accuracy": 0.4585651755332947, + "num_tokens": 9226363045.0, + "step": 18048 + }, + { + "epoch": 4.880746349378042, + "grad_norm": 4.59375, + "learning_rate": 0.0020269610674261743, + "loss": 2.5368, + "mean_token_accuracy": 0.45445218682289124, + "num_tokens": 9226887280.0, + "step": 18049 + }, + { + "epoch": 4.8810167658193615, + "grad_norm": 3.375, + "learning_rate": 0.0020268392702526587, + "loss": 2.9214, + "mean_token_accuracy": 0.43229615688323975, + "num_tokens": 9227404333.0, + "step": 18050 + }, + { + "epoch": 4.881287182260682, + "grad_norm": 10.1875, + "learning_rate": 0.0020267177484021032, + "loss": 1.7631, + "mean_token_accuracy": 0.5855770111083984, + "num_tokens": 9227928543.0, + "step": 18051 + }, + { + "epoch": 4.881557598702001, + "grad_norm": 7.21875, + "learning_rate": 0.0020265965018782363, + "loss": 2.641, + "mean_token_accuracy": 0.4461536109447479, + "num_tokens": 9228452763.0, + "step": 18052 + }, + { + "epoch": 4.881828015143321, + "grad_norm": 3.0625, + "learning_rate": 0.0020264755306847767, + "loss": 2.8113, + "mean_token_accuracy": 0.4433524012565613, + "num_tokens": 9228977015.0, + "step": 18053 + }, + { + "epoch": 4.88209843158464, + "grad_norm": 3.78125, + "learning_rate": 0.002026354834825438, + "loss": 2.686, + "mean_token_accuracy": 0.458859384059906, + "num_tokens": 9229474255.0, + "step": 18054 + }, + { + "epoch": 4.88236884802596, + "grad_norm": 3.734375, + "learning_rate": 0.0020262344143039224, + "loss": 2.7756, + "mean_token_accuracy": 0.4337003529071808, + "num_tokens": 9229938387.0, + "step": 18055 + }, + { + "epoch": 4.882639264467279, + "grad_norm": 3.5625, + "learning_rate": 0.002026114269123925, + "loss": 2.6727, + "mean_token_accuracy": 0.46280786395072937, + "num_tokens": 9230462532.0, + "step": 18056 + }, + { + "epoch": 4.8829096809085994, + "grad_norm": 3.796875, + "learning_rate": 0.0020259943992891323, + "loss": 2.9593, + "mean_token_accuracy": 0.437275767326355, + "num_tokens": 9230934062.0, + "step": 18057 + }, + { + "epoch": 4.883180097349919, + "grad_norm": 4.53125, + "learning_rate": 0.0020258748048032223, + "loss": 2.7336, + "mean_token_accuracy": 0.43726229667663574, + "num_tokens": 9231458343.0, + "step": 18058 + }, + { + "epoch": 4.883450513791239, + "grad_norm": 3.421875, + "learning_rate": 0.0020257554856698643, + "loss": 2.7482, + "mean_token_accuracy": 0.44028276205062866, + "num_tokens": 9231982433.0, + "step": 18059 + }, + { + "epoch": 4.883720930232558, + "grad_norm": 3.796875, + "learning_rate": 0.0020256364418927203, + "loss": 2.9181, + "mean_token_accuracy": 0.4368317723274231, + "num_tokens": 9232452716.0, + "step": 18060 + }, + { + "epoch": 4.883991346673878, + "grad_norm": 4.375, + "learning_rate": 0.0020255176734754418, + "loss": 2.8623, + "mean_token_accuracy": 0.44560983777046204, + "num_tokens": 9232976980.0, + "step": 18061 + }, + { + "epoch": 4.884261763115197, + "grad_norm": 4.25, + "learning_rate": 0.0020253991804216726, + "loss": 2.6857, + "mean_token_accuracy": 0.44491684436798096, + "num_tokens": 9233501184.0, + "step": 18062 + }, + { + "epoch": 4.884532179556517, + "grad_norm": 3.546875, + "learning_rate": 0.0020252809627350496, + "loss": 2.731, + "mean_token_accuracy": 0.4529632031917572, + "num_tokens": 9234001819.0, + "step": 18063 + }, + { + "epoch": 4.8848025959978365, + "grad_norm": 3.796875, + "learning_rate": 0.0020251630204191997, + "loss": 2.8968, + "mean_token_accuracy": 0.43925225734710693, + "num_tokens": 9234451725.0, + "step": 18064 + }, + { + "epoch": 4.885073012439157, + "grad_norm": 4.0625, + "learning_rate": 0.0020250453534777415, + "loss": 2.8328, + "mean_token_accuracy": 0.43430954217910767, + "num_tokens": 9234975999.0, + "step": 18065 + }, + { + "epoch": 4.885343428880476, + "grad_norm": 4.375, + "learning_rate": 0.0020249279619142854, + "loss": 2.8776, + "mean_token_accuracy": 0.4254158139228821, + "num_tokens": 9235447943.0, + "step": 18066 + }, + { + "epoch": 4.885613845321796, + "grad_norm": 3.59375, + "learning_rate": 0.002024810845732434, + "loss": 2.6285, + "mean_token_accuracy": 0.46113288402557373, + "num_tokens": 9235972171.0, + "step": 18067 + }, + { + "epoch": 4.885884261763115, + "grad_norm": 3.59375, + "learning_rate": 0.002024694004935779, + "loss": 2.8039, + "mean_token_accuracy": 0.4154435098171234, + "num_tokens": 9236496415.0, + "step": 18068 + }, + { + "epoch": 4.886154678204435, + "grad_norm": 3.46875, + "learning_rate": 0.002024577439527908, + "loss": 2.7739, + "mean_token_accuracy": 0.44558247923851013, + "num_tokens": 9236986931.0, + "step": 18069 + }, + { + "epoch": 4.886425094645754, + "grad_norm": 3.859375, + "learning_rate": 0.002024461149512395, + "loss": 2.633, + "mean_token_accuracy": 0.4336557686328888, + "num_tokens": 9237511147.0, + "step": 18070 + }, + { + "epoch": 4.8866955110870745, + "grad_norm": 82.0, + "learning_rate": 0.0020243451348928095, + "loss": 4.8291, + "mean_token_accuracy": 0.27790307998657227, + "num_tokens": 9238035327.0, + "step": 18071 + }, + { + "epoch": 4.886965927528394, + "grad_norm": 7.4375, + "learning_rate": 0.002024229395672712, + "loss": 2.7735, + "mean_token_accuracy": 0.43877899646759033, + "num_tokens": 9238559588.0, + "step": 18072 + }, + { + "epoch": 4.887236343969714, + "grad_norm": 3.0, + "learning_rate": 0.0020241139318556514, + "loss": 2.4862, + "mean_token_accuracy": 0.45545655488967896, + "num_tokens": 9239083827.0, + "step": 18073 + }, + { + "epoch": 4.887506760411033, + "grad_norm": 3.875, + "learning_rate": 0.002023998743445172, + "loss": 2.8167, + "mean_token_accuracy": 0.4334348738193512, + "num_tokens": 9239606486.0, + "step": 18074 + }, + { + "epoch": 4.887777176852353, + "grad_norm": 4.71875, + "learning_rate": 0.002023883830444808, + "loss": 2.7261, + "mean_token_accuracy": 0.4624481797218323, + "num_tokens": 9240032102.0, + "step": 18075 + }, + { + "epoch": 4.888047593293672, + "grad_norm": 3.828125, + "learning_rate": 0.002023769192858085, + "loss": 2.6667, + "mean_token_accuracy": 0.45077067613601685, + "num_tokens": 9240556360.0, + "step": 18076 + }, + { + "epoch": 4.888318009734991, + "grad_norm": 3.953125, + "learning_rate": 0.0020236548306885203, + "loss": 2.7201, + "mean_token_accuracy": 0.44667571783065796, + "num_tokens": 9241037526.0, + "step": 18077 + }, + { + "epoch": 4.8885884261763115, + "grad_norm": 3.6875, + "learning_rate": 0.0020235407439396246, + "loss": 2.7251, + "mean_token_accuracy": 0.4621877372264862, + "num_tokens": 9241538405.0, + "step": 18078 + }, + { + "epoch": 4.888858842617632, + "grad_norm": 3.984375, + "learning_rate": 0.002023426932614895, + "loss": 2.6474, + "mean_token_accuracy": 0.46376198530197144, + "num_tokens": 9242062613.0, + "step": 18079 + }, + { + "epoch": 4.889129259058951, + "grad_norm": 3.9375, + "learning_rate": 0.0020233133967178267, + "loss": 2.5705, + "mean_token_accuracy": 0.47739776968955994, + "num_tokens": 9242586705.0, + "step": 18080 + }, + { + "epoch": 4.88939967550027, + "grad_norm": 3.09375, + "learning_rate": 0.002023200136251902, + "loss": 2.7272, + "mean_token_accuracy": 0.45946308970451355, + "num_tokens": 9243041703.0, + "step": 18081 + }, + { + "epoch": 4.88967009194159, + "grad_norm": 4.0, + "learning_rate": 0.002023087151220595, + "loss": 2.8358, + "mean_token_accuracy": 0.42568516731262207, + "num_tokens": 9243534084.0, + "step": 18082 + }, + { + "epoch": 4.88994050838291, + "grad_norm": 3.65625, + "learning_rate": 0.0020229744416273747, + "loss": 2.7327, + "mean_token_accuracy": 0.45808500051498413, + "num_tokens": 9244001117.0, + "step": 18083 + }, + { + "epoch": 4.890210924824229, + "grad_norm": 3.375, + "learning_rate": 0.0020228620074756987, + "loss": 2.7747, + "mean_token_accuracy": 0.44587913155555725, + "num_tokens": 9244525353.0, + "step": 18084 + }, + { + "epoch": 4.890481341265549, + "grad_norm": 4.6875, + "learning_rate": 0.0020227498487690146, + "loss": 2.6359, + "mean_token_accuracy": 0.5061346292495728, + "num_tokens": 9245049547.0, + "step": 18085 + }, + { + "epoch": 4.890751757706869, + "grad_norm": 3.046875, + "learning_rate": 0.0020226379655107667, + "loss": 2.7786, + "mean_token_accuracy": 0.43980488181114197, + "num_tokens": 9245573714.0, + "step": 18086 + }, + { + "epoch": 4.891022174148188, + "grad_norm": 4.09375, + "learning_rate": 0.0020225263577043867, + "loss": 2.9123, + "mean_token_accuracy": 0.42778146266937256, + "num_tokens": 9246093812.0, + "step": 18087 + }, + { + "epoch": 4.891292590589508, + "grad_norm": 3.859375, + "learning_rate": 0.0020224150253532985, + "loss": 2.7394, + "mean_token_accuracy": 0.4701462984085083, + "num_tokens": 9246580158.0, + "step": 18088 + }, + { + "epoch": 4.891563007030827, + "grad_norm": 2.953125, + "learning_rate": 0.002022303968460919, + "loss": 2.7712, + "mean_token_accuracy": 0.4486771523952484, + "num_tokens": 9247104296.0, + "step": 18089 + }, + { + "epoch": 4.891833423472147, + "grad_norm": 4.375, + "learning_rate": 0.002022193187030656, + "loss": 2.9384, + "mean_token_accuracy": 0.441442608833313, + "num_tokens": 9247569204.0, + "step": 18090 + }, + { + "epoch": 4.892103839913466, + "grad_norm": 38.25, + "learning_rate": 0.002022082681065907, + "loss": 3.3113, + "mean_token_accuracy": 0.42274945974349976, + "num_tokens": 9248083523.0, + "step": 18091 + }, + { + "epoch": 4.8923742563547865, + "grad_norm": 8.0625, + "learning_rate": 0.002021972450570065, + "loss": 2.7536, + "mean_token_accuracy": 0.4383748173713684, + "num_tokens": 9248607628.0, + "step": 18092 + }, + { + "epoch": 4.892644672796106, + "grad_norm": 2.421875, + "learning_rate": 0.0020218624955465098, + "loss": 2.9218, + "mean_token_accuracy": 0.44078758358955383, + "num_tokens": 9249131905.0, + "step": 18093 + }, + { + "epoch": 4.892915089237426, + "grad_norm": 4.15625, + "learning_rate": 0.002021752815998617, + "loss": 2.5721, + "mean_token_accuracy": 0.47108596563339233, + "num_tokens": 9249592221.0, + "step": 18094 + }, + { + "epoch": 4.893185505678745, + "grad_norm": 3.890625, + "learning_rate": 0.0020216434119297508, + "loss": 2.813, + "mean_token_accuracy": 0.43772855401039124, + "num_tokens": 9250095979.0, + "step": 18095 + }, + { + "epoch": 4.893455922120065, + "grad_norm": 3.453125, + "learning_rate": 0.0020215342833432674, + "loss": 2.7769, + "mean_token_accuracy": 0.4443517029285431, + "num_tokens": 9250620138.0, + "step": 18096 + }, + { + "epoch": 4.893726338561384, + "grad_norm": 4.1875, + "learning_rate": 0.0020214254302425173, + "loss": 2.7678, + "mean_token_accuracy": 0.4541891813278198, + "num_tokens": 9251144383.0, + "step": 18097 + }, + { + "epoch": 4.893996755002704, + "grad_norm": 4.59375, + "learning_rate": 0.0020213168526308388, + "loss": 2.5956, + "mean_token_accuracy": 0.4583430290222168, + "num_tokens": 9251636667.0, + "step": 18098 + }, + { + "epoch": 4.894267171444024, + "grad_norm": 3.34375, + "learning_rate": 0.002021208550511564, + "loss": 2.6662, + "mean_token_accuracy": 0.4668874442577362, + "num_tokens": 9252138971.0, + "step": 18099 + }, + { + "epoch": 4.894537587885344, + "grad_norm": 4.15625, + "learning_rate": 0.0020211005238880155, + "loss": 2.9303, + "mean_token_accuracy": 0.43361207842826843, + "num_tokens": 9252636634.0, + "step": 18100 + }, + { + "epoch": 4.894808004326663, + "grad_norm": 3.96875, + "learning_rate": 0.0020209927727635087, + "loss": 2.7659, + "mean_token_accuracy": 0.43527594208717346, + "num_tokens": 9253160824.0, + "step": 18101 + }, + { + "epoch": 4.895078420767983, + "grad_norm": 3.984375, + "learning_rate": 0.0020208852971413485, + "loss": 2.8425, + "mean_token_accuracy": 0.42602601647377014, + "num_tokens": 9253685016.0, + "step": 18102 + }, + { + "epoch": 4.895348837209302, + "grad_norm": 4.34375, + "learning_rate": 0.002020778097024834, + "loss": 2.6446, + "mean_token_accuracy": 0.46086636185646057, + "num_tokens": 9254209294.0, + "step": 18103 + }, + { + "epoch": 4.895619253650622, + "grad_norm": 3.84375, + "learning_rate": 0.002020671172417253, + "loss": 2.7327, + "mean_token_accuracy": 0.44523024559020996, + "num_tokens": 9254698797.0, + "step": 18104 + }, + { + "epoch": 4.895889670091941, + "grad_norm": 3.640625, + "learning_rate": 0.002020564523321887, + "loss": 2.747, + "mean_token_accuracy": 0.4603480100631714, + "num_tokens": 9255165183.0, + "step": 18105 + }, + { + "epoch": 4.8961600865332615, + "grad_norm": 4.21875, + "learning_rate": 0.0020204581497420084, + "loss": 2.7805, + "mean_token_accuracy": 0.4277515411376953, + "num_tokens": 9255689463.0, + "step": 18106 + }, + { + "epoch": 4.896430502974581, + "grad_norm": 3.9375, + "learning_rate": 0.002020352051680882, + "loss": 2.6513, + "mean_token_accuracy": 0.449128121137619, + "num_tokens": 9256213646.0, + "step": 18107 + }, + { + "epoch": 4.896700919415901, + "grad_norm": 3.84375, + "learning_rate": 0.00202024622914176, + "loss": 2.6919, + "mean_token_accuracy": 0.44449949264526367, + "num_tokens": 9256737834.0, + "step": 18108 + }, + { + "epoch": 4.89697133585722, + "grad_norm": 3.578125, + "learning_rate": 0.002020140682127892, + "loss": 2.8432, + "mean_token_accuracy": 0.4398980736732483, + "num_tokens": 9257230575.0, + "step": 18109 + }, + { + "epoch": 4.89724175229854, + "grad_norm": 3.828125, + "learning_rate": 0.0020200354106425164, + "loss": 2.8718, + "mean_token_accuracy": 0.4378811717033386, + "num_tokens": 9257754788.0, + "step": 18110 + }, + { + "epoch": 4.897512168739859, + "grad_norm": 20.25, + "learning_rate": 0.002019930414688863, + "loss": 2.5417, + "mean_token_accuracy": 0.4849669337272644, + "num_tokens": 9258231054.0, + "step": 18111 + }, + { + "epoch": 4.897782585181179, + "grad_norm": 7.71875, + "learning_rate": 0.0020198256942701525, + "loss": 2.9185, + "mean_token_accuracy": 0.43463075160980225, + "num_tokens": 9258755280.0, + "step": 18112 + }, + { + "epoch": 4.898053001622499, + "grad_norm": 2.5625, + "learning_rate": 0.002019721249389599, + "loss": 2.687, + "mean_token_accuracy": 0.4785100221633911, + "num_tokens": 9259218090.0, + "step": 18113 + }, + { + "epoch": 4.898323418063819, + "grad_norm": 4.3125, + "learning_rate": 0.0020196170800504064, + "loss": 2.8543, + "mean_token_accuracy": 0.43871140480041504, + "num_tokens": 9259742232.0, + "step": 18114 + }, + { + "epoch": 4.898593834505138, + "grad_norm": 4.1875, + "learning_rate": 0.002019513186255772, + "loss": 2.8038, + "mean_token_accuracy": 0.44495660066604614, + "num_tokens": 9260220870.0, + "step": 18115 + }, + { + "epoch": 4.898864250946458, + "grad_norm": 4.09375, + "learning_rate": 0.0020194095680088823, + "loss": 2.7411, + "mean_token_accuracy": 0.4516461491584778, + "num_tokens": 9260745144.0, + "step": 18116 + }, + { + "epoch": 4.899134667387777, + "grad_norm": 4.375, + "learning_rate": 0.0020193062253129187, + "loss": 2.7905, + "mean_token_accuracy": 0.45228105783462524, + "num_tokens": 9261269324.0, + "step": 18117 + }, + { + "epoch": 4.899405083829096, + "grad_norm": 4.0, + "learning_rate": 0.0020192031581710494, + "loss": 2.6321, + "mean_token_accuracy": 0.4479823708534241, + "num_tokens": 9261793429.0, + "step": 18118 + }, + { + "epoch": 4.899675500270416, + "grad_norm": 3.578125, + "learning_rate": 0.0020191003665864377, + "loss": 2.6427, + "mean_token_accuracy": 0.46169912815093994, + "num_tokens": 9262317705.0, + "step": 18119 + }, + { + "epoch": 4.8999459167117365, + "grad_norm": 4.03125, + "learning_rate": 0.002018997850562239, + "loss": 2.9537, + "mean_token_accuracy": 0.43763846158981323, + "num_tokens": 9262841962.0, + "step": 18120 + }, + { + "epoch": 4.900216333153056, + "grad_norm": 3.953125, + "learning_rate": 0.002018895610101596, + "loss": 2.6269, + "mean_token_accuracy": 0.4460166096687317, + "num_tokens": 9263366203.0, + "step": 18121 + }, + { + "epoch": 4.900486749594375, + "grad_norm": 3.875, + "learning_rate": 0.002018793645207649, + "loss": 2.6539, + "mean_token_accuracy": 0.44827479124069214, + "num_tokens": 9263890463.0, + "step": 18122 + }, + { + "epoch": 4.900757166035695, + "grad_norm": 3.3125, + "learning_rate": 0.0020186919558835247, + "loss": 2.8858, + "mean_token_accuracy": 0.4425441026687622, + "num_tokens": 9264400431.0, + "step": 18123 + }, + { + "epoch": 4.901027582477015, + "grad_norm": 4.09375, + "learning_rate": 0.002018590542132343, + "loss": 2.6884, + "mean_token_accuracy": 0.44560855627059937, + "num_tokens": 9264924545.0, + "step": 18124 + }, + { + "epoch": 4.901297998918334, + "grad_norm": 4.25, + "learning_rate": 0.0020184894039572162, + "loss": 2.6552, + "mean_token_accuracy": 0.44492867588996887, + "num_tokens": 9265448542.0, + "step": 18125 + }, + { + "epoch": 4.9015684153596535, + "grad_norm": 4.59375, + "learning_rate": 0.002018388541361247, + "loss": 2.8369, + "mean_token_accuracy": 0.42991113662719727, + "num_tokens": 9265972742.0, + "step": 18126 + }, + { + "epoch": 4.901838831800974, + "grad_norm": 3.8125, + "learning_rate": 0.0020182879543475317, + "loss": 2.8762, + "mean_token_accuracy": 0.41281601786613464, + "num_tokens": 9266496925.0, + "step": 18127 + }, + { + "epoch": 4.902109248242293, + "grad_norm": 4.125, + "learning_rate": 0.002018187642919154, + "loss": 2.7398, + "mean_token_accuracy": 0.44567495584487915, + "num_tokens": 9267021102.0, + "step": 18128 + }, + { + "epoch": 4.902379664683613, + "grad_norm": 4.59375, + "learning_rate": 0.002018087607079194, + "loss": 2.7557, + "mean_token_accuracy": 0.4419978857040405, + "num_tokens": 9267545304.0, + "step": 18129 + }, + { + "epoch": 4.902650081124932, + "grad_norm": 4.15625, + "learning_rate": 0.0020179878468307203, + "loss": 2.6795, + "mean_token_accuracy": 0.4489022195339203, + "num_tokens": 9268069578.0, + "step": 18130 + }, + { + "epoch": 4.902920497566252, + "grad_norm": 18.875, + "learning_rate": 0.0020178883621767936, + "loss": 2.7109, + "mean_token_accuracy": 0.45055973529815674, + "num_tokens": 9268593734.0, + "step": 18131 + }, + { + "epoch": 4.903190914007571, + "grad_norm": 7.75, + "learning_rate": 0.002017789153120467, + "loss": 2.7337, + "mean_token_accuracy": 0.4586349427700043, + "num_tokens": 9269118010.0, + "step": 18132 + }, + { + "epoch": 4.903461330448891, + "grad_norm": 2.859375, + "learning_rate": 0.0020176902196647833, + "loss": 2.7354, + "mean_token_accuracy": 0.472247838973999, + "num_tokens": 9269573284.0, + "step": 18133 + }, + { + "epoch": 4.903731746890211, + "grad_norm": 3.15625, + "learning_rate": 0.0020175915618127795, + "loss": 2.7565, + "mean_token_accuracy": 0.45508813858032227, + "num_tokens": 9270093993.0, + "step": 18134 + }, + { + "epoch": 4.904002163331531, + "grad_norm": 4.25, + "learning_rate": 0.0020174931795674833, + "loss": 2.7865, + "mean_token_accuracy": 0.4394017159938812, + "num_tokens": 9270577272.0, + "step": 18135 + }, + { + "epoch": 4.90427257977285, + "grad_norm": 3.46875, + "learning_rate": 0.002017395072931911, + "loss": 2.7513, + "mean_token_accuracy": 0.42860543727874756, + "num_tokens": 9271101549.0, + "step": 18136 + }, + { + "epoch": 4.90454299621417, + "grad_norm": 4.125, + "learning_rate": 0.002017297241909074, + "loss": 2.9294, + "mean_token_accuracy": 0.43845534324645996, + "num_tokens": 9271625752.0, + "step": 18137 + }, + { + "epoch": 4.904813412655489, + "grad_norm": 4.0625, + "learning_rate": 0.0020171996865019732, + "loss": 2.8114, + "mean_token_accuracy": 0.445054292678833, + "num_tokens": 9272145559.0, + "step": 18138 + }, + { + "epoch": 4.905083829096809, + "grad_norm": 3.78125, + "learning_rate": 0.0020171024067136036, + "loss": 2.8024, + "mean_token_accuracy": 0.4528082609176636, + "num_tokens": 9272610992.0, + "step": 18139 + }, + { + "epoch": 4.9053542455381285, + "grad_norm": 4.1875, + "learning_rate": 0.0020170054025469494, + "loss": 2.6979, + "mean_token_accuracy": 0.4432407021522522, + "num_tokens": 9273135007.0, + "step": 18140 + }, + { + "epoch": 4.905624661979449, + "grad_norm": 3.5, + "learning_rate": 0.0020169086740049864, + "loss": 2.7852, + "mean_token_accuracy": 0.4480176568031311, + "num_tokens": 9273659290.0, + "step": 18141 + }, + { + "epoch": 4.905895078420768, + "grad_norm": 4.96875, + "learning_rate": 0.002016812221090683, + "loss": 2.6322, + "mean_token_accuracy": 0.44919252395629883, + "num_tokens": 9274183574.0, + "step": 18142 + }, + { + "epoch": 4.906165494862088, + "grad_norm": 3.359375, + "learning_rate": 0.002016716043806999, + "loss": 2.7361, + "mean_token_accuracy": 0.45459800958633423, + "num_tokens": 9274643062.0, + "step": 18143 + }, + { + "epoch": 4.906435911303407, + "grad_norm": 4.15625, + "learning_rate": 0.0020166201421568833, + "loss": 2.8822, + "mean_token_accuracy": 0.4307419955730438, + "num_tokens": 9275167242.0, + "step": 18144 + }, + { + "epoch": 4.906706327744727, + "grad_norm": 3.75, + "learning_rate": 0.002016524516143281, + "loss": 2.6054, + "mean_token_accuracy": 0.46255362033843994, + "num_tokens": 9275628241.0, + "step": 18145 + }, + { + "epoch": 4.906976744186046, + "grad_norm": 3.796875, + "learning_rate": 0.0020164291657691256, + "loss": 2.6554, + "mean_token_accuracy": 0.4505774974822998, + "num_tokens": 9276152434.0, + "step": 18146 + }, + { + "epoch": 4.907247160627366, + "grad_norm": 3.390625, + "learning_rate": 0.002016334091037341, + "loss": 2.869, + "mean_token_accuracy": 0.4538244903087616, + "num_tokens": 9276630842.0, + "step": 18147 + }, + { + "epoch": 4.907517577068686, + "grad_norm": 3.734375, + "learning_rate": 0.0020162392919508476, + "loss": 2.7532, + "mean_token_accuracy": 0.41571637988090515, + "num_tokens": 9277155076.0, + "step": 18148 + }, + { + "epoch": 4.907787993510006, + "grad_norm": 4.46875, + "learning_rate": 0.002016144768512551, + "loss": 2.7188, + "mean_token_accuracy": 0.4329715371131897, + "num_tokens": 9277679325.0, + "step": 18149 + }, + { + "epoch": 4.908058409951325, + "grad_norm": 3.859375, + "learning_rate": 0.002016050520725353, + "loss": 2.7424, + "mean_token_accuracy": 0.4686652719974518, + "num_tokens": 9278203457.0, + "step": 18150 + }, + { + "epoch": 4.908328826392645, + "grad_norm": 87.0, + "learning_rate": 0.0020159565485921454, + "loss": 3.9157, + "mean_token_accuracy": 0.3458139896392822, + "num_tokens": 9278722981.0, + "step": 18151 + }, + { + "epoch": 4.908599242833964, + "grad_norm": 8.5625, + "learning_rate": 0.002015862852115811, + "loss": 2.5614, + "mean_token_accuracy": 0.4644321799278259, + "num_tokens": 9279247227.0, + "step": 18152 + }, + { + "epoch": 4.908869659275284, + "grad_norm": 3.84375, + "learning_rate": 0.0020157694312992252, + "loss": 2.8621, + "mean_token_accuracy": 0.4356858730316162, + "num_tokens": 9279771428.0, + "step": 18153 + }, + { + "epoch": 4.9091400757166035, + "grad_norm": 5.09375, + "learning_rate": 0.0020156762861452534, + "loss": 2.8341, + "mean_token_accuracy": 0.442377507686615, + "num_tokens": 9280295643.0, + "step": 18154 + }, + { + "epoch": 4.909410492157924, + "grad_norm": 3.421875, + "learning_rate": 0.0020155834166567547, + "loss": 2.7883, + "mean_token_accuracy": 0.4408120810985565, + "num_tokens": 9280819908.0, + "step": 18155 + }, + { + "epoch": 4.909680908599243, + "grad_norm": 5.03125, + "learning_rate": 0.0020154908228365775, + "loss": 2.9176, + "mean_token_accuracy": 0.41312652826309204, + "num_tokens": 9281344103.0, + "step": 18156 + }, + { + "epoch": 4.909951325040563, + "grad_norm": 3.671875, + "learning_rate": 0.0020153985046875655, + "loss": 2.7467, + "mean_token_accuracy": 0.44415658712387085, + "num_tokens": 9281868114.0, + "step": 18157 + }, + { + "epoch": 4.910221741481882, + "grad_norm": 3.984375, + "learning_rate": 0.0020153064622125474, + "loss": 2.756, + "mean_token_accuracy": 0.4382314085960388, + "num_tokens": 9282388627.0, + "step": 18158 + }, + { + "epoch": 4.910492157923201, + "grad_norm": 5.40625, + "learning_rate": 0.0020152146954143502, + "loss": 2.6181, + "mean_token_accuracy": 0.4875662922859192, + "num_tokens": 9282851125.0, + "step": 18159 + }, + { + "epoch": 4.910762574364521, + "grad_norm": 3.03125, + "learning_rate": 0.002015123204295788, + "loss": 2.701, + "mean_token_accuracy": 0.4621948003768921, + "num_tokens": 9283344873.0, + "step": 18160 + }, + { + "epoch": 4.911032990805841, + "grad_norm": 3.84375, + "learning_rate": 0.0020150319888596695, + "loss": 2.6516, + "mean_token_accuracy": 0.44552209973335266, + "num_tokens": 9283869030.0, + "step": 18161 + }, + { + "epoch": 4.911303407247161, + "grad_norm": 3.859375, + "learning_rate": 0.002014941049108793, + "loss": 2.7596, + "mean_token_accuracy": 0.44510823488235474, + "num_tokens": 9284393301.0, + "step": 18162 + }, + { + "epoch": 4.91157382368848, + "grad_norm": 4.28125, + "learning_rate": 0.0020148503850459477, + "loss": 2.8237, + "mean_token_accuracy": 0.4752455949783325, + "num_tokens": 9284854075.0, + "step": 18163 + }, + { + "epoch": 4.9118442401298, + "grad_norm": 3.640625, + "learning_rate": 0.0020147599966739167, + "loss": 2.7034, + "mean_token_accuracy": 0.46613991260528564, + "num_tokens": 9285309049.0, + "step": 18164 + }, + { + "epoch": 4.91211465657112, + "grad_norm": 4.03125, + "learning_rate": 0.0020146698839954725, + "loss": 2.7553, + "mean_token_accuracy": 0.4371750056743622, + "num_tokens": 9285833311.0, + "step": 18165 + }, + { + "epoch": 4.912385073012439, + "grad_norm": 4.59375, + "learning_rate": 0.00201458004701338, + "loss": 2.7107, + "mean_token_accuracy": 0.42257750034332275, + "num_tokens": 9286346882.0, + "step": 18166 + }, + { + "epoch": 4.912655489453758, + "grad_norm": 4.03125, + "learning_rate": 0.0020144904857303967, + "loss": 2.6381, + "mean_token_accuracy": 0.4663965106010437, + "num_tokens": 9286826731.0, + "step": 18167 + }, + { + "epoch": 4.9129259058950785, + "grad_norm": 4.21875, + "learning_rate": 0.0020144012001492694, + "loss": 2.7953, + "mean_token_accuracy": 0.4484429359436035, + "num_tokens": 9287302759.0, + "step": 18168 + }, + { + "epoch": 4.913196322336398, + "grad_norm": 3.640625, + "learning_rate": 0.002014312190272739, + "loss": 2.7915, + "mean_token_accuracy": 0.43977993726730347, + "num_tokens": 9287827005.0, + "step": 18169 + }, + { + "epoch": 4.913466738777718, + "grad_norm": 4.28125, + "learning_rate": 0.0020142234561035346, + "loss": 2.6185, + "mean_token_accuracy": 0.465702086687088, + "num_tokens": 9288351287.0, + "step": 18170 + }, + { + "epoch": 4.913737155219037, + "grad_norm": 26.875, + "learning_rate": 0.002014134997644381, + "loss": 3.257, + "mean_token_accuracy": 0.40402552485466003, + "num_tokens": 9288827569.0, + "step": 18171 + }, + { + "epoch": 4.914007571660357, + "grad_norm": 7.59375, + "learning_rate": 0.0020140468148979913, + "loss": 2.6502, + "mean_token_accuracy": 0.44019609689712524, + "num_tokens": 9289351846.0, + "step": 18172 + }, + { + "epoch": 4.914277988101676, + "grad_norm": 3.0625, + "learning_rate": 0.0020139589078670716, + "loss": 2.7386, + "mean_token_accuracy": 0.44920438528060913, + "num_tokens": 9289871904.0, + "step": 18173 + }, + { + "epoch": 4.914548404542996, + "grad_norm": 3.96875, + "learning_rate": 0.002013871276554317, + "loss": 2.6623, + "mean_token_accuracy": 0.4448675513267517, + "num_tokens": 9290396166.0, + "step": 18174 + }, + { + "epoch": 4.9148188209843156, + "grad_norm": 3.765625, + "learning_rate": 0.0020137839209624204, + "loss": 2.8923, + "mean_token_accuracy": 0.43567925691604614, + "num_tokens": 9290920322.0, + "step": 18175 + }, + { + "epoch": 4.915089237425636, + "grad_norm": 4.1875, + "learning_rate": 0.002013696841094059, + "loss": 2.8755, + "mean_token_accuracy": 0.42547842860221863, + "num_tokens": 9291444591.0, + "step": 18176 + }, + { + "epoch": 4.915359653866955, + "grad_norm": 3.96875, + "learning_rate": 0.002013610036951905, + "loss": 2.5177, + "mean_token_accuracy": 0.456297367811203, + "num_tokens": 9291922187.0, + "step": 18177 + }, + { + "epoch": 4.915630070308275, + "grad_norm": 3.859375, + "learning_rate": 0.0020135235085386232, + "loss": 2.835, + "mean_token_accuracy": 0.4258824586868286, + "num_tokens": 9292446401.0, + "step": 18178 + }, + { + "epoch": 4.915900486749594, + "grad_norm": 4.5, + "learning_rate": 0.0020134372558568664, + "loss": 2.7245, + "mean_token_accuracy": 0.458065927028656, + "num_tokens": 9292899392.0, + "step": 18179 + }, + { + "epoch": 4.916170903190914, + "grad_norm": 4.25, + "learning_rate": 0.002013351278909284, + "loss": 2.9348, + "mean_token_accuracy": 0.43673163652420044, + "num_tokens": 9293423664.0, + "step": 18180 + }, + { + "epoch": 4.916441319632233, + "grad_norm": 4.125, + "learning_rate": 0.002013265577698511, + "loss": 2.7876, + "mean_token_accuracy": 0.4434845745563507, + "num_tokens": 9293947900.0, + "step": 18181 + }, + { + "epoch": 4.9167117360735535, + "grad_norm": 3.703125, + "learning_rate": 0.0020131801522271795, + "loss": 2.7202, + "mean_token_accuracy": 0.4467165768146515, + "num_tokens": 9294472082.0, + "step": 18182 + }, + { + "epoch": 4.916982152514873, + "grad_norm": 3.515625, + "learning_rate": 0.002013095002497909, + "loss": 2.8599, + "mean_token_accuracy": 0.4271681606769562, + "num_tokens": 9294996220.0, + "step": 18183 + }, + { + "epoch": 4.917252568956193, + "grad_norm": 3.828125, + "learning_rate": 0.0020130101285133124, + "loss": 2.6523, + "mean_token_accuracy": 0.43887993693351746, + "num_tokens": 9295520430.0, + "step": 18184 + }, + { + "epoch": 4.917522985397512, + "grad_norm": 4.5625, + "learning_rate": 0.002012925530275994, + "loss": 2.813, + "mean_token_accuracy": 0.4387793242931366, + "num_tokens": 9295969867.0, + "step": 18185 + }, + { + "epoch": 4.917793401838832, + "grad_norm": 3.65625, + "learning_rate": 0.0020128412077885494, + "loss": 2.7749, + "mean_token_accuracy": 0.4274876117706299, + "num_tokens": 9296493983.0, + "step": 18186 + }, + { + "epoch": 4.918063818280151, + "grad_norm": 4.125, + "learning_rate": 0.002012757161053566, + "loss": 2.8085, + "mean_token_accuracy": 0.4501684308052063, + "num_tokens": 9296989738.0, + "step": 18187 + }, + { + "epoch": 4.918334234721471, + "grad_norm": 4.6875, + "learning_rate": 0.002012673390073623, + "loss": 2.9438, + "mean_token_accuracy": 0.4535948634147644, + "num_tokens": 9297512887.0, + "step": 18188 + }, + { + "epoch": 4.9186046511627906, + "grad_norm": 4.0625, + "learning_rate": 0.00201258989485129, + "loss": 2.6335, + "mean_token_accuracy": 0.4511517286300659, + "num_tokens": 9298014005.0, + "step": 18189 + }, + { + "epoch": 4.918875067604111, + "grad_norm": 3.953125, + "learning_rate": 0.0020125066753891303, + "loss": 2.7276, + "mean_token_accuracy": 0.4370710253715515, + "num_tokens": 9298505042.0, + "step": 18190 + }, + { + "epoch": 4.91914548404543, + "grad_norm": 13.0625, + "learning_rate": 0.002012423731689696, + "loss": 2.3566, + "mean_token_accuracy": 0.5437002778053284, + "num_tokens": 9298968750.0, + "step": 18191 + }, + { + "epoch": 4.91941590048675, + "grad_norm": 7.8125, + "learning_rate": 0.0020123410637555316, + "loss": 2.9611, + "mean_token_accuracy": 0.43394580483436584, + "num_tokens": 9299474016.0, + "step": 18192 + }, + { + "epoch": 4.919686316928069, + "grad_norm": 2.734375, + "learning_rate": 0.002012258671589175, + "loss": 2.9621, + "mean_token_accuracy": 0.43412643671035767, + "num_tokens": 9299953366.0, + "step": 18193 + }, + { + "epoch": 4.919956733369389, + "grad_norm": 3.359375, + "learning_rate": 0.0020121765551931532, + "loss": 2.8034, + "mean_token_accuracy": 0.4405270218849182, + "num_tokens": 9300477611.0, + "step": 18194 + }, + { + "epoch": 4.920227149810708, + "grad_norm": 4.03125, + "learning_rate": 0.002012094714569986, + "loss": 2.8193, + "mean_token_accuracy": 0.4387837052345276, + "num_tokens": 9301001784.0, + "step": 18195 + }, + { + "epoch": 4.9204975662520285, + "grad_norm": 3.75, + "learning_rate": 0.002012013149722185, + "loss": 2.9484, + "mean_token_accuracy": 0.4400791525840759, + "num_tokens": 9301487897.0, + "step": 18196 + }, + { + "epoch": 4.920767982693348, + "grad_norm": 3.984375, + "learning_rate": 0.002011931860652253, + "loss": 2.6906, + "mean_token_accuracy": 0.4505719542503357, + "num_tokens": 9302012042.0, + "step": 18197 + }, + { + "epoch": 4.921038399134668, + "grad_norm": 3.78125, + "learning_rate": 0.002011850847362683, + "loss": 2.6663, + "mean_token_accuracy": 0.43580392003059387, + "num_tokens": 9302536245.0, + "step": 18198 + }, + { + "epoch": 4.921308815575987, + "grad_norm": 4.59375, + "learning_rate": 0.0020117701098559615, + "loss": 2.7329, + "mean_token_accuracy": 0.44336608052253723, + "num_tokens": 9303054089.0, + "step": 18199 + }, + { + "epoch": 4.921579232017306, + "grad_norm": 4.65625, + "learning_rate": 0.002011689648134566, + "loss": 2.91, + "mean_token_accuracy": 0.44095319509506226, + "num_tokens": 9303578371.0, + "step": 18200 + }, + { + "epoch": 4.921849648458626, + "grad_norm": 5.5, + "learning_rate": 0.002011609462200964, + "loss": 2.9656, + "mean_token_accuracy": 0.43884286284446716, + "num_tokens": 9304102520.0, + "step": 18201 + }, + { + "epoch": 4.922120064899946, + "grad_norm": 4.46875, + "learning_rate": 0.0020115295520576177, + "loss": 2.5951, + "mean_token_accuracy": 0.45418134331703186, + "num_tokens": 9304589975.0, + "step": 18202 + }, + { + "epoch": 4.9223904813412656, + "grad_norm": 3.96875, + "learning_rate": 0.002011449917706978, + "loss": 2.6606, + "mean_token_accuracy": 0.4370323717594147, + "num_tokens": 9305114112.0, + "step": 18203 + }, + { + "epoch": 4.922660897782585, + "grad_norm": 3.65625, + "learning_rate": 0.0020113705591514887, + "loss": 2.872, + "mean_token_accuracy": 0.45251139998435974, + "num_tokens": 9305611598.0, + "step": 18204 + }, + { + "epoch": 4.922931314223905, + "grad_norm": 4.59375, + "learning_rate": 0.002011291476393585, + "loss": 2.8178, + "mean_token_accuracy": 0.44646748900413513, + "num_tokens": 9306112381.0, + "step": 18205 + }, + { + "epoch": 4.923201730665225, + "grad_norm": 4.125, + "learning_rate": 0.002011212669435691, + "loss": 2.5469, + "mean_token_accuracy": 0.4670543372631073, + "num_tokens": 9306636602.0, + "step": 18206 + }, + { + "epoch": 4.923472147106544, + "grad_norm": 3.59375, + "learning_rate": 0.0020111341382802275, + "loss": 2.7471, + "mean_token_accuracy": 0.4541182518005371, + "num_tokens": 9307119324.0, + "step": 18207 + }, + { + "epoch": 4.923742563547863, + "grad_norm": 4.28125, + "learning_rate": 0.0020110558829296043, + "loss": 2.7597, + "mean_token_accuracy": 0.42976316809654236, + "num_tokens": 9307643604.0, + "step": 18208 + }, + { + "epoch": 4.924012979989183, + "grad_norm": 4.03125, + "learning_rate": 0.0020109779033862206, + "loss": 2.5862, + "mean_token_accuracy": 0.46017980575561523, + "num_tokens": 9308160762.0, + "step": 18209 + }, + { + "epoch": 4.924283396430503, + "grad_norm": 3.578125, + "learning_rate": 0.0020109001996524694, + "loss": 2.7247, + "mean_token_accuracy": 0.4547705054283142, + "num_tokens": 9308685020.0, + "step": 18210 + }, + { + "epoch": 4.924553812871823, + "grad_norm": 28.25, + "learning_rate": 0.002010822771730735, + "loss": 2.5597, + "mean_token_accuracy": 0.4271663427352905, + "num_tokens": 9309209197.0, + "step": 18211 + }, + { + "epoch": 4.924824229313142, + "grad_norm": 27.625, + "learning_rate": 0.0020107456196233942, + "loss": 2.5619, + "mean_token_accuracy": 0.5007631778717041, + "num_tokens": 9309733474.0, + "step": 18212 + }, + { + "epoch": 4.925094645754462, + "grad_norm": 6.625, + "learning_rate": 0.002010668743332814, + "loss": 2.787, + "mean_token_accuracy": 0.43666353821754456, + "num_tokens": 9310257679.0, + "step": 18213 + }, + { + "epoch": 4.925365062195781, + "grad_norm": 3.453125, + "learning_rate": 0.002010592142861351, + "loss": 2.7768, + "mean_token_accuracy": 0.4300243854522705, + "num_tokens": 9310781877.0, + "step": 18214 + }, + { + "epoch": 4.925635478637101, + "grad_norm": 20.125, + "learning_rate": 0.0020105158182113577, + "loss": 2.5727, + "mean_token_accuracy": 0.47693392634391785, + "num_tokens": 9311306037.0, + "step": 18215 + }, + { + "epoch": 4.9259058950784205, + "grad_norm": 5.75, + "learning_rate": 0.0020104397693851764, + "loss": 2.7144, + "mean_token_accuracy": 0.4311808943748474, + "num_tokens": 9311773976.0, + "step": 18216 + }, + { + "epoch": 4.926176311519741, + "grad_norm": 3.625, + "learning_rate": 0.0020103639963851387, + "loss": 2.8234, + "mean_token_accuracy": 0.43654415011405945, + "num_tokens": 9312243317.0, + "step": 18217 + }, + { + "epoch": 4.92644672796106, + "grad_norm": 3.53125, + "learning_rate": 0.0020102884992135705, + "loss": 2.8318, + "mean_token_accuracy": 0.4466588497161865, + "num_tokens": 9312767495.0, + "step": 18218 + }, + { + "epoch": 4.92671714440238, + "grad_norm": 3.671875, + "learning_rate": 0.0020102132778727893, + "loss": 2.7862, + "mean_token_accuracy": 0.43751564621925354, + "num_tokens": 9313291670.0, + "step": 18219 + }, + { + "epoch": 4.926987560843699, + "grad_norm": 7.5625, + "learning_rate": 0.0020101383323651006, + "loss": 2.5589, + "mean_token_accuracy": 0.4798002243041992, + "num_tokens": 9313815950.0, + "step": 18220 + }, + { + "epoch": 4.927257977285019, + "grad_norm": 2.875, + "learning_rate": 0.0020100636626928063, + "loss": 2.7763, + "mean_token_accuracy": 0.4525349736213684, + "num_tokens": 9314340224.0, + "step": 18221 + }, + { + "epoch": 4.927528393726338, + "grad_norm": 3.109375, + "learning_rate": 0.0020099892688581955, + "loss": 2.8448, + "mean_token_accuracy": 0.4337533712387085, + "num_tokens": 9314864502.0, + "step": 18222 + }, + { + "epoch": 4.927798810167658, + "grad_norm": 3.796875, + "learning_rate": 0.002009915150863552, + "loss": 2.9397, + "mean_token_accuracy": 0.446186363697052, + "num_tokens": 9315388778.0, + "step": 18223 + }, + { + "epoch": 4.928069226608978, + "grad_norm": 4.40625, + "learning_rate": 0.00200984130871115, + "loss": 2.8424, + "mean_token_accuracy": 0.44714105129241943, + "num_tokens": 9315913001.0, + "step": 18224 + }, + { + "epoch": 4.928339643050298, + "grad_norm": 4.4375, + "learning_rate": 0.0020097677424032556, + "loss": 2.7431, + "mean_token_accuracy": 0.42960530519485474, + "num_tokens": 9316437164.0, + "step": 18225 + }, + { + "epoch": 4.928610059491617, + "grad_norm": 3.6875, + "learning_rate": 0.0020096944519421246, + "loss": 2.6609, + "mean_token_accuracy": 0.4411964416503906, + "num_tokens": 9316961398.0, + "step": 18226 + }, + { + "epoch": 4.928880475932937, + "grad_norm": 3.8125, + "learning_rate": 0.0020096214373300073, + "loss": 2.6141, + "mean_token_accuracy": 0.4536423981189728, + "num_tokens": 9317485452.0, + "step": 18227 + }, + { + "epoch": 4.929150892374256, + "grad_norm": 3.4375, + "learning_rate": 0.0020095486985691417, + "loss": 2.7368, + "mean_token_accuracy": 0.45022010803222656, + "num_tokens": 9318009738.0, + "step": 18228 + }, + { + "epoch": 4.929421308815576, + "grad_norm": 4.15625, + "learning_rate": 0.002009476235661763, + "loss": 2.8767, + "mean_token_accuracy": 0.44923073053359985, + "num_tokens": 9318495260.0, + "step": 18229 + }, + { + "epoch": 4.9296917252568955, + "grad_norm": 4.25, + "learning_rate": 0.002009404048610092, + "loss": 2.7036, + "mean_token_accuracy": 0.449665367603302, + "num_tokens": 9319019532.0, + "step": 18230 + }, + { + "epoch": 4.929962141698216, + "grad_norm": 62.0, + "learning_rate": 0.0020093321374163443, + "loss": 3.1067, + "mean_token_accuracy": 0.4312198758125305, + "num_tokens": 9319543727.0, + "step": 18231 + }, + { + "epoch": 4.930232558139535, + "grad_norm": 6.59375, + "learning_rate": 0.0020092605020827275, + "loss": 2.7506, + "mean_token_accuracy": 0.44744807481765747, + "num_tokens": 9320067913.0, + "step": 18232 + }, + { + "epoch": 4.930502974580855, + "grad_norm": 3.3125, + "learning_rate": 0.002009189142611437, + "loss": 2.7999, + "mean_token_accuracy": 0.44339054822921753, + "num_tokens": 9320518934.0, + "step": 18233 + }, + { + "epoch": 4.930773391022174, + "grad_norm": 3.671875, + "learning_rate": 0.0020091180590046647, + "loss": 2.9117, + "mean_token_accuracy": 0.42107027769088745, + "num_tokens": 9321043088.0, + "step": 18234 + }, + { + "epoch": 4.931043807463494, + "grad_norm": 3.046875, + "learning_rate": 0.0020090472512645915, + "loss": 2.7628, + "mean_token_accuracy": 0.4354017674922943, + "num_tokens": 9321567293.0, + "step": 18235 + }, + { + "epoch": 4.931314223904813, + "grad_norm": 4.53125, + "learning_rate": 0.0020089767193933877, + "loss": 2.722, + "mean_token_accuracy": 0.4451230764389038, + "num_tokens": 9322038908.0, + "step": 18236 + }, + { + "epoch": 4.931584640346133, + "grad_norm": 4.0625, + "learning_rate": 0.00200890646339322, + "loss": 2.7037, + "mean_token_accuracy": 0.45581603050231934, + "num_tokens": 9322560201.0, + "step": 18237 + }, + { + "epoch": 4.931855056787453, + "grad_norm": 4.78125, + "learning_rate": 0.002008836483266243, + "loss": 2.8422, + "mean_token_accuracy": 0.44795018434524536, + "num_tokens": 9323072558.0, + "step": 18238 + }, + { + "epoch": 4.932125473228773, + "grad_norm": 3.765625, + "learning_rate": 0.002008766779014604, + "loss": 2.6975, + "mean_token_accuracy": 0.44804564118385315, + "num_tokens": 9323596816.0, + "step": 18239 + }, + { + "epoch": 4.932395889670092, + "grad_norm": 3.5, + "learning_rate": 0.0020086973506404414, + "loss": 2.6815, + "mean_token_accuracy": 0.4681971073150635, + "num_tokens": 9324066383.0, + "step": 18240 + }, + { + "epoch": 4.932666306111411, + "grad_norm": 5.03125, + "learning_rate": 0.002008628198145886, + "loss": 2.9124, + "mean_token_accuracy": 0.4265408515930176, + "num_tokens": 9324590569.0, + "step": 18241 + }, + { + "epoch": 4.932936722552731, + "grad_norm": 4.90625, + "learning_rate": 0.0020085593215330598, + "loss": 2.6414, + "mean_token_accuracy": 0.4630628824234009, + "num_tokens": 9325084751.0, + "step": 18242 + }, + { + "epoch": 4.933207138994051, + "grad_norm": 2.96875, + "learning_rate": 0.002008490720804075, + "loss": 2.6658, + "mean_token_accuracy": 0.45085424184799194, + "num_tokens": 9325583223.0, + "step": 18243 + }, + { + "epoch": 4.9334775554353705, + "grad_norm": 4.4375, + "learning_rate": 0.002008422395961037, + "loss": 2.7196, + "mean_token_accuracy": 0.46717342734336853, + "num_tokens": 9326047195.0, + "step": 18244 + }, + { + "epoch": 4.93374797187669, + "grad_norm": 3.875, + "learning_rate": 0.002008354347006043, + "loss": 2.8115, + "mean_token_accuracy": 0.438535213470459, + "num_tokens": 9326571335.0, + "step": 18245 + }, + { + "epoch": 4.93401838831801, + "grad_norm": 4.28125, + "learning_rate": 0.00200828657394118, + "loss": 2.7406, + "mean_token_accuracy": 0.4428897500038147, + "num_tokens": 9327095505.0, + "step": 18246 + }, + { + "epoch": 4.93428880475933, + "grad_norm": 3.921875, + "learning_rate": 0.002008219076768529, + "loss": 2.8648, + "mean_token_accuracy": 0.43801194429397583, + "num_tokens": 9327595619.0, + "step": 18247 + }, + { + "epoch": 4.934559221200649, + "grad_norm": 4.34375, + "learning_rate": 0.002008151855490158, + "loss": 2.9141, + "mean_token_accuracy": 0.442086398601532, + "num_tokens": 9328076943.0, + "step": 18248 + }, + { + "epoch": 4.934829637641968, + "grad_norm": 3.453125, + "learning_rate": 0.0020080849101081323, + "loss": 2.6647, + "mean_token_accuracy": 0.46395254135131836, + "num_tokens": 9328600986.0, + "step": 18249 + }, + { + "epoch": 4.935100054083288, + "grad_norm": 3.765625, + "learning_rate": 0.0020080182406245054, + "loss": 2.6055, + "mean_token_accuracy": 0.444776713848114, + "num_tokens": 9329077015.0, + "step": 18250 + }, + { + "epoch": 4.9353704705246075, + "grad_norm": 128.0, + "learning_rate": 0.002007951847041322, + "loss": 3.0115, + "mean_token_accuracy": 0.4584559500217438, + "num_tokens": 9329601280.0, + "step": 18251 + }, + { + "epoch": 4.935640886965928, + "grad_norm": 6.75, + "learning_rate": 0.0020078857293606197, + "loss": 2.7344, + "mean_token_accuracy": 0.4396611750125885, + "num_tokens": 9330125342.0, + "step": 18252 + }, + { + "epoch": 4.935911303407247, + "grad_norm": 3.390625, + "learning_rate": 0.0020078198875844275, + "loss": 2.5786, + "mean_token_accuracy": 0.46605780720710754, + "num_tokens": 9330625692.0, + "step": 18253 + }, + { + "epoch": 4.936181719848567, + "grad_norm": 3.796875, + "learning_rate": 0.002007754321714766, + "loss": 2.7012, + "mean_token_accuracy": 0.4183501601219177, + "num_tokens": 9331149775.0, + "step": 18254 + }, + { + "epoch": 4.936452136289886, + "grad_norm": 4.75, + "learning_rate": 0.0020076890317536466, + "loss": 2.7154, + "mean_token_accuracy": 0.4563750624656677, + "num_tokens": 9331674009.0, + "step": 18255 + }, + { + "epoch": 4.936722552731206, + "grad_norm": 3.796875, + "learning_rate": 0.002007624017703071, + "loss": 2.7561, + "mean_token_accuracy": 0.41878026723861694, + "num_tokens": 9332198192.0, + "step": 18256 + }, + { + "epoch": 4.936992969172525, + "grad_norm": 4.78125, + "learning_rate": 0.002007559279565036, + "loss": 2.73, + "mean_token_accuracy": 0.4643828272819519, + "num_tokens": 9332722428.0, + "step": 18257 + }, + { + "epoch": 4.9372633856138455, + "grad_norm": 28.25, + "learning_rate": 0.002007494817341528, + "loss": 2.5981, + "mean_token_accuracy": 0.4378000497817993, + "num_tokens": 9333246706.0, + "step": 18258 + }, + { + "epoch": 4.937533802055165, + "grad_norm": 4.5, + "learning_rate": 0.0020074306310345234, + "loss": 2.7612, + "mean_token_accuracy": 0.43422526121139526, + "num_tokens": 9333770728.0, + "step": 18259 + }, + { + "epoch": 4.937804218496485, + "grad_norm": 6.5625, + "learning_rate": 0.002007366720645992, + "loss": 2.7878, + "mean_token_accuracy": 0.47624409198760986, + "num_tokens": 9334294856.0, + "step": 18260 + }, + { + "epoch": 4.938074634937804, + "grad_norm": 3.203125, + "learning_rate": 0.002007303086177896, + "loss": 2.9187, + "mean_token_accuracy": 0.433734655380249, + "num_tokens": 9334819140.0, + "step": 18261 + }, + { + "epoch": 4.938345051379124, + "grad_norm": 4.4375, + "learning_rate": 0.002007239727632187, + "loss": 2.5616, + "mean_token_accuracy": 0.4611755609512329, + "num_tokens": 9335343404.0, + "step": 18262 + }, + { + "epoch": 4.938615467820443, + "grad_norm": 3.921875, + "learning_rate": 0.00200717664501081, + "loss": 2.7708, + "mean_token_accuracy": 0.4411068260669708, + "num_tokens": 9335867675.0, + "step": 18263 + }, + { + "epoch": 4.938885884261763, + "grad_norm": 3.421875, + "learning_rate": 0.002007113838315699, + "loss": 2.8959, + "mean_token_accuracy": 0.4284024238586426, + "num_tokens": 9336391957.0, + "step": 18264 + }, + { + "epoch": 4.9391563007030825, + "grad_norm": 4.03125, + "learning_rate": 0.0020070513075487817, + "loss": 2.8779, + "mean_token_accuracy": 0.4407320022583008, + "num_tokens": 9336916238.0, + "step": 18265 + }, + { + "epoch": 4.939426717144403, + "grad_norm": 3.609375, + "learning_rate": 0.002006989052711977, + "loss": 2.5702, + "mean_token_accuracy": 0.44360703229904175, + "num_tokens": 9337440429.0, + "step": 18266 + }, + { + "epoch": 4.939697133585722, + "grad_norm": 4.5625, + "learning_rate": 0.0020069270738071947, + "loss": 2.7671, + "mean_token_accuracy": 0.46650567650794983, + "num_tokens": 9337955954.0, + "step": 18267 + }, + { + "epoch": 4.939967550027042, + "grad_norm": 3.75, + "learning_rate": 0.002006865370836337, + "loss": 2.8045, + "mean_token_accuracy": 0.4485219717025757, + "num_tokens": 9338480195.0, + "step": 18268 + }, + { + "epoch": 4.940237966468361, + "grad_norm": 4.6875, + "learning_rate": 0.0020068039438012964, + "loss": 2.8397, + "mean_token_accuracy": 0.43982625007629395, + "num_tokens": 9339003441.0, + "step": 18269 + }, + { + "epoch": 4.940508382909681, + "grad_norm": 3.328125, + "learning_rate": 0.0020067427927039573, + "loss": 2.6506, + "mean_token_accuracy": 0.4586288332939148, + "num_tokens": 9339515622.0, + "step": 18270 + }, + { + "epoch": 4.940778799351, + "grad_norm": 74.0, + "learning_rate": 0.0020066819175461976, + "loss": 4.4312, + "mean_token_accuracy": 0.30074310302734375, + "num_tokens": 9339986195.0, + "step": 18271 + }, + { + "epoch": 4.9410492157923205, + "grad_norm": 7.25, + "learning_rate": 0.0020066213183298847, + "loss": 2.9409, + "mean_token_accuracy": 0.4318351745605469, + "num_tokens": 9340510397.0, + "step": 18272 + }, + { + "epoch": 4.94131963223364, + "grad_norm": 4.15625, + "learning_rate": 0.002006560995056877, + "loss": 2.6499, + "mean_token_accuracy": 0.4379720687866211, + "num_tokens": 9341034675.0, + "step": 18273 + }, + { + "epoch": 4.94159004867496, + "grad_norm": 3.28125, + "learning_rate": 0.002006500947729026, + "loss": 2.7081, + "mean_token_accuracy": 0.4473256766796112, + "num_tokens": 9341529002.0, + "step": 18274 + }, + { + "epoch": 4.941860465116279, + "grad_norm": 4.34375, + "learning_rate": 0.002006441176348174, + "loss": 2.6768, + "mean_token_accuracy": 0.454245001077652, + "num_tokens": 9342053091.0, + "step": 18275 + }, + { + "epoch": 4.942130881557599, + "grad_norm": 3.75, + "learning_rate": 0.002006381680916156, + "loss": 2.7879, + "mean_token_accuracy": 0.4548148512840271, + "num_tokens": 9342524415.0, + "step": 18276 + }, + { + "epoch": 4.942401297998918, + "grad_norm": 3.921875, + "learning_rate": 0.0020063224614347954, + "loss": 2.674, + "mean_token_accuracy": 0.4540591537952423, + "num_tokens": 9343045240.0, + "step": 18277 + }, + { + "epoch": 4.942671714440238, + "grad_norm": 4.21875, + "learning_rate": 0.0020062635179059107, + "loss": 2.7609, + "mean_token_accuracy": 0.44124722480773926, + "num_tokens": 9343569498.0, + "step": 18278 + }, + { + "epoch": 4.9429421308815575, + "grad_norm": 3.375, + "learning_rate": 0.0020062048503313103, + "loss": 2.786, + "mean_token_accuracy": 0.45345914363861084, + "num_tokens": 9344093630.0, + "step": 18279 + }, + { + "epoch": 4.943212547322878, + "grad_norm": 3.765625, + "learning_rate": 0.0020061464587127938, + "loss": 2.8956, + "mean_token_accuracy": 0.42744287848472595, + "num_tokens": 9344617889.0, + "step": 18280 + }, + { + "epoch": 4.943482963764197, + "grad_norm": 3.765625, + "learning_rate": 0.0020060883430521544, + "loss": 2.886, + "mean_token_accuracy": 0.4272410273551941, + "num_tokens": 9345142128.0, + "step": 18281 + }, + { + "epoch": 4.943753380205516, + "grad_norm": 3.4375, + "learning_rate": 0.0020060305033511726, + "loss": 2.6856, + "mean_token_accuracy": 0.43565016984939575, + "num_tokens": 9345666236.0, + "step": 18282 + }, + { + "epoch": 4.944023796646836, + "grad_norm": 3.28125, + "learning_rate": 0.0020059729396116246, + "loss": 2.7677, + "mean_token_accuracy": 0.4298999011516571, + "num_tokens": 9346190466.0, + "step": 18283 + }, + { + "epoch": 4.944294213088156, + "grad_norm": 3.296875, + "learning_rate": 0.0020059156518352768, + "loss": 2.6758, + "mean_token_accuracy": 0.4528893828392029, + "num_tokens": 9346714642.0, + "step": 18284 + }, + { + "epoch": 4.944564629529475, + "grad_norm": 4.15625, + "learning_rate": 0.002005858640023887, + "loss": 2.5708, + "mean_token_accuracy": 0.45377230644226074, + "num_tokens": 9347238749.0, + "step": 18285 + }, + { + "epoch": 4.944835045970795, + "grad_norm": 3.453125, + "learning_rate": 0.0020058019041792035, + "loss": 2.819, + "mean_token_accuracy": 0.44255751371383667, + "num_tokens": 9347762946.0, + "step": 18286 + }, + { + "epoch": 4.945105462412115, + "grad_norm": 3.640625, + "learning_rate": 0.0020057454443029683, + "loss": 2.7955, + "mean_token_accuracy": 0.4466966688632965, + "num_tokens": 9348231755.0, + "step": 18287 + }, + { + "epoch": 4.945375878853435, + "grad_norm": 3.59375, + "learning_rate": 0.0020056892603969125, + "loss": 2.6437, + "mean_token_accuracy": 0.45993632078170776, + "num_tokens": 9348698410.0, + "step": 18288 + }, + { + "epoch": 4.945646295294754, + "grad_norm": 3.875, + "learning_rate": 0.0020056333524627614, + "loss": 2.6654, + "mean_token_accuracy": 0.45711785554885864, + "num_tokens": 9349166048.0, + "step": 18289 + }, + { + "epoch": 4.945916711736073, + "grad_norm": 4.5625, + "learning_rate": 0.00200557772050223, + "loss": 2.9237, + "mean_token_accuracy": 0.4378230571746826, + "num_tokens": 9349678850.0, + "step": 18290 + }, + { + "epoch": 4.946187128177393, + "grad_norm": 45.0, + "learning_rate": 0.0020055223645170247, + "loss": 3.6638, + "mean_token_accuracy": 0.3392246663570404, + "num_tokens": 9350203069.0, + "step": 18291 + }, + { + "epoch": 4.9464575446187125, + "grad_norm": 9.25, + "learning_rate": 0.002005467284508844, + "loss": 2.7218, + "mean_token_accuracy": 0.4544646143913269, + "num_tokens": 9350727342.0, + "step": 18292 + }, + { + "epoch": 4.9467279610600325, + "grad_norm": 4.9375, + "learning_rate": 0.0020054124804793788, + "loss": 2.5123, + "mean_token_accuracy": 0.4592171311378479, + "num_tokens": 9351191605.0, + "step": 18293 + }, + { + "epoch": 4.946998377501352, + "grad_norm": 30.375, + "learning_rate": 0.0020053579524303097, + "loss": 2.6863, + "mean_token_accuracy": 0.4516335725784302, + "num_tokens": 9351715600.0, + "step": 18294 + }, + { + "epoch": 4.947268793942672, + "grad_norm": 5.09375, + "learning_rate": 0.0020053037003633107, + "loss": 2.7309, + "mean_token_accuracy": 0.436423122882843, + "num_tokens": 9352239699.0, + "step": 18295 + }, + { + "epoch": 4.947539210383991, + "grad_norm": 3.109375, + "learning_rate": 0.0020052497242800445, + "loss": 2.8138, + "mean_token_accuracy": 0.437104195356369, + "num_tokens": 9352763912.0, + "step": 18296 + }, + { + "epoch": 4.947809626825311, + "grad_norm": 3.234375, + "learning_rate": 0.0020051960241821705, + "loss": 2.8586, + "mean_token_accuracy": 0.47547656297683716, + "num_tokens": 9353223180.0, + "step": 18297 + }, + { + "epoch": 4.94808004326663, + "grad_norm": 3.40625, + "learning_rate": 0.002005142600071333, + "loss": 2.7521, + "mean_token_accuracy": 0.44809776544570923, + "num_tokens": 9353747355.0, + "step": 18298 + }, + { + "epoch": 4.94835045970795, + "grad_norm": 3.015625, + "learning_rate": 0.002005089451949173, + "loss": 2.7308, + "mean_token_accuracy": 0.4647645056247711, + "num_tokens": 9354271485.0, + "step": 18299 + }, + { + "epoch": 4.94862087614927, + "grad_norm": 4.3125, + "learning_rate": 0.0020050365798173213, + "loss": 2.6149, + "mean_token_accuracy": 0.4373741149902344, + "num_tokens": 9354795702.0, + "step": 18300 + }, + { + "epoch": 4.94889129259059, + "grad_norm": 3.09375, + "learning_rate": 0.0020049839836773988, + "loss": 2.5017, + "mean_token_accuracy": 0.5076894760131836, + "num_tokens": 9355288163.0, + "step": 18301 + }, + { + "epoch": 4.949161709031909, + "grad_norm": 3.109375, + "learning_rate": 0.0020049316635310215, + "loss": 2.7886, + "mean_token_accuracy": 0.43997013568878174, + "num_tokens": 9355812248.0, + "step": 18302 + }, + { + "epoch": 4.949432125473229, + "grad_norm": 4.03125, + "learning_rate": 0.002004879619379793, + "loss": 2.6786, + "mean_token_accuracy": 0.44906705617904663, + "num_tokens": 9356336502.0, + "step": 18303 + }, + { + "epoch": 4.949702541914548, + "grad_norm": 4.28125, + "learning_rate": 0.0020048278512253105, + "loss": 2.9349, + "mean_token_accuracy": 0.430955708026886, + "num_tokens": 9356860677.0, + "step": 18304 + }, + { + "epoch": 4.949972958355868, + "grad_norm": 5.03125, + "learning_rate": 0.0020047763590691626, + "loss": 2.6513, + "mean_token_accuracy": 0.4438462555408478, + "num_tokens": 9357384857.0, + "step": 18305 + }, + { + "epoch": 4.9502433747971875, + "grad_norm": 4.1875, + "learning_rate": 0.0020047251429129295, + "loss": 2.6017, + "mean_token_accuracy": 0.4893438220024109, + "num_tokens": 9357909119.0, + "step": 18306 + }, + { + "epoch": 4.9505137912385075, + "grad_norm": 4.8125, + "learning_rate": 0.0020046742027581823, + "loss": 2.8269, + "mean_token_accuracy": 0.42830753326416016, + "num_tokens": 9358433348.0, + "step": 18307 + }, + { + "epoch": 4.950784207679827, + "grad_norm": 3.53125, + "learning_rate": 0.002004623538606484, + "loss": 2.8378, + "mean_token_accuracy": 0.4345114827156067, + "num_tokens": 9358957582.0, + "step": 18308 + }, + { + "epoch": 4.951054624121147, + "grad_norm": 3.6875, + "learning_rate": 0.0020045731504593897, + "loss": 2.748, + "mean_token_accuracy": 0.4252907335758209, + "num_tokens": 9359481865.0, + "step": 18309 + }, + { + "epoch": 4.951325040562466, + "grad_norm": 3.546875, + "learning_rate": 0.0020045230383184446, + "loss": 2.7794, + "mean_token_accuracy": 0.4602366089820862, + "num_tokens": 9360006084.0, + "step": 18310 + }, + { + "epoch": 4.951595457003786, + "grad_norm": 106.5, + "learning_rate": 0.0020044732021851875, + "loss": 5.4778, + "mean_token_accuracy": 0.25405365228652954, + "num_tokens": 9360488398.0, + "step": 18311 + }, + { + "epoch": 4.951865873445105, + "grad_norm": 6.9375, + "learning_rate": 0.0020044236420611447, + "loss": 2.8397, + "mean_token_accuracy": 0.445203572511673, + "num_tokens": 9360955608.0, + "step": 18312 + }, + { + "epoch": 4.952136289886425, + "grad_norm": 3.609375, + "learning_rate": 0.0020043743579478404, + "loss": 2.7695, + "mean_token_accuracy": 0.43960344791412354, + "num_tokens": 9361476973.0, + "step": 18313 + }, + { + "epoch": 4.952406706327745, + "grad_norm": 3.78125, + "learning_rate": 0.002004325349846785, + "loss": 2.5867, + "mean_token_accuracy": 0.4498554468154907, + "num_tokens": 9362001200.0, + "step": 18314 + }, + { + "epoch": 4.952677122769065, + "grad_norm": 3.765625, + "learning_rate": 0.0020042766177594826, + "loss": 2.945, + "mean_token_accuracy": 0.4256225824356079, + "num_tokens": 9362525483.0, + "step": 18315 + }, + { + "epoch": 4.952947539210384, + "grad_norm": 4.5, + "learning_rate": 0.0020042281616874276, + "loss": 2.8797, + "mean_token_accuracy": 0.44095611572265625, + "num_tokens": 9363049748.0, + "step": 18316 + }, + { + "epoch": 4.953217955651704, + "grad_norm": 4.84375, + "learning_rate": 0.0020041799816321072, + "loss": 2.7028, + "mean_token_accuracy": 0.4256172776222229, + "num_tokens": 9363573986.0, + "step": 18317 + }, + { + "epoch": 4.953488372093023, + "grad_norm": 4.28125, + "learning_rate": 0.0020041320775950012, + "loss": 2.5747, + "mean_token_accuracy": 0.44071078300476074, + "num_tokens": 9364098166.0, + "step": 18318 + }, + { + "epoch": 4.953758788534343, + "grad_norm": 5.0, + "learning_rate": 0.002004084449577577, + "loss": 2.6535, + "mean_token_accuracy": 0.44414693117141724, + "num_tokens": 9364622365.0, + "step": 18319 + }, + { + "epoch": 4.9540292049756625, + "grad_norm": 4.53125, + "learning_rate": 0.0020040370975812983, + "loss": 2.8449, + "mean_token_accuracy": 0.4469445049762726, + "num_tokens": 9365146547.0, + "step": 18320 + }, + { + "epoch": 4.9542996214169825, + "grad_norm": 4.375, + "learning_rate": 0.002003990021607616, + "loss": 2.9391, + "mean_token_accuracy": 0.42611730098724365, + "num_tokens": 9365670740.0, + "step": 18321 + }, + { + "epoch": 4.954570037858302, + "grad_norm": 4.4375, + "learning_rate": 0.002003943221657975, + "loss": 2.6226, + "mean_token_accuracy": 0.45685043931007385, + "num_tokens": 9366195012.0, + "step": 18322 + }, + { + "epoch": 4.954840454299621, + "grad_norm": 4.5625, + "learning_rate": 0.0020038966977338124, + "loss": 2.6116, + "mean_token_accuracy": 0.44067060947418213, + "num_tokens": 9366687727.0, + "step": 18323 + }, + { + "epoch": 4.955110870740941, + "grad_norm": 3.625, + "learning_rate": 0.0020038504498365545, + "loss": 2.6533, + "mean_token_accuracy": 0.4437946677207947, + "num_tokens": 9367211898.0, + "step": 18324 + }, + { + "epoch": 4.955381287182261, + "grad_norm": 3.625, + "learning_rate": 0.0020038044779676217, + "loss": 2.5717, + "mean_token_accuracy": 0.43334463238716125, + "num_tokens": 9367699138.0, + "step": 18325 + }, + { + "epoch": 4.95565170362358, + "grad_norm": 3.046875, + "learning_rate": 0.0020037587821284224, + "loss": 2.7568, + "mean_token_accuracy": 0.4622344970703125, + "num_tokens": 9368124090.0, + "step": 18326 + }, + { + "epoch": 4.9559221200648995, + "grad_norm": 3.953125, + "learning_rate": 0.0020037133623203606, + "loss": 2.957, + "mean_token_accuracy": 0.4257405400276184, + "num_tokens": 9368648300.0, + "step": 18327 + }, + { + "epoch": 4.95619253650622, + "grad_norm": 4.40625, + "learning_rate": 0.0020036682185448286, + "loss": 2.7867, + "mean_token_accuracy": 0.4423454701900482, + "num_tokens": 9369143131.0, + "step": 18328 + }, + { + "epoch": 4.95646295294754, + "grad_norm": 6.71875, + "learning_rate": 0.002003623350803213, + "loss": 2.8512, + "mean_token_accuracy": 0.4274106025695801, + "num_tokens": 9369665934.0, + "step": 18329 + }, + { + "epoch": 4.956733369388859, + "grad_norm": 3.21875, + "learning_rate": 0.0020035787590968893, + "loss": 2.8226, + "mean_token_accuracy": 0.4533880352973938, + "num_tokens": 9370166308.0, + "step": 18330 + }, + { + "epoch": 4.957003785830178, + "grad_norm": 210.0, + "learning_rate": 0.0020035344434272255, + "loss": 6.732, + "mean_token_accuracy": 0.10537399351596832, + "num_tokens": 9370690511.0, + "step": 18331 + }, + { + "epoch": 4.957274202271498, + "grad_norm": 7.53125, + "learning_rate": 0.002003490403795582, + "loss": 2.64, + "mean_token_accuracy": 0.4607474207878113, + "num_tokens": 9371122327.0, + "step": 18332 + }, + { + "epoch": 4.957544618712817, + "grad_norm": 3.75, + "learning_rate": 0.00200344664020331, + "loss": 2.8096, + "mean_token_accuracy": 0.41147881746292114, + "num_tokens": 9371646521.0, + "step": 18333 + }, + { + "epoch": 4.9578150351541375, + "grad_norm": 5.40625, + "learning_rate": 0.002003403152651753, + "loss": 2.5549, + "mean_token_accuracy": 0.49433809518814087, + "num_tokens": 9372170789.0, + "step": 18334 + }, + { + "epoch": 4.958085451595457, + "grad_norm": 4.3125, + "learning_rate": 0.0020033599411422433, + "loss": 2.9269, + "mean_token_accuracy": 0.4192734360694885, + "num_tokens": 9372657591.0, + "step": 18335 + }, + { + "epoch": 4.958355868036777, + "grad_norm": 4.0, + "learning_rate": 0.002003317005676109, + "loss": 2.732, + "mean_token_accuracy": 0.43270987272262573, + "num_tokens": 9373181623.0, + "step": 18336 + }, + { + "epoch": 4.958626284478096, + "grad_norm": 3.734375, + "learning_rate": 0.0020032743462546658, + "loss": 2.9216, + "mean_token_accuracy": 0.43658047914505005, + "num_tokens": 9373705883.0, + "step": 18337 + }, + { + "epoch": 4.958896700919416, + "grad_norm": 3.515625, + "learning_rate": 0.0020032319628792233, + "loss": 2.8078, + "mean_token_accuracy": 0.4426206648349762, + "num_tokens": 9374230139.0, + "step": 18338 + }, + { + "epoch": 4.959167117360735, + "grad_norm": 3.25, + "learning_rate": 0.002003189855551082, + "loss": 2.8163, + "mean_token_accuracy": 0.4561869204044342, + "num_tokens": 9374747731.0, + "step": 18339 + }, + { + "epoch": 4.959437533802055, + "grad_norm": 3.671875, + "learning_rate": 0.002003148024271535, + "loss": 2.7132, + "mean_token_accuracy": 0.45060157775878906, + "num_tokens": 9375215388.0, + "step": 18340 + }, + { + "epoch": 4.9597079502433745, + "grad_norm": 3.84375, + "learning_rate": 0.0020031064690418637, + "loss": 2.7942, + "mean_token_accuracy": 0.43167367577552795, + "num_tokens": 9375739660.0, + "step": 18341 + }, + { + "epoch": 4.959978366684695, + "grad_norm": 37.25, + "learning_rate": 0.0020030651898633434, + "loss": 2.9487, + "mean_token_accuracy": 0.4151618182659149, + "num_tokens": 9376263764.0, + "step": 18342 + }, + { + "epoch": 4.960248783126014, + "grad_norm": 5.625, + "learning_rate": 0.002003024186737242, + "loss": 2.8078, + "mean_token_accuracy": 0.4557277262210846, + "num_tokens": 9376767221.0, + "step": 18343 + }, + { + "epoch": 4.960519199567334, + "grad_norm": 4.15625, + "learning_rate": 0.0020029834596648173, + "loss": 2.563, + "mean_token_accuracy": 0.49755263328552246, + "num_tokens": 9377291366.0, + "step": 18344 + }, + { + "epoch": 4.960789616008653, + "grad_norm": 4.625, + "learning_rate": 0.0020029430086473178, + "loss": 2.5025, + "mean_token_accuracy": 0.48907554149627686, + "num_tokens": 9377815564.0, + "step": 18345 + }, + { + "epoch": 4.961060032449973, + "grad_norm": 3.015625, + "learning_rate": 0.002002902833685986, + "loss": 2.7026, + "mean_token_accuracy": 0.46696633100509644, + "num_tokens": 9378310875.0, + "step": 18346 + }, + { + "epoch": 4.961330448891292, + "grad_norm": 4.03125, + "learning_rate": 0.002002862934782054, + "loss": 2.8888, + "mean_token_accuracy": 0.4471008777618408, + "num_tokens": 9378815698.0, + "step": 18347 + }, + { + "epoch": 4.9616008653326125, + "grad_norm": 3.78125, + "learning_rate": 0.0020028233119367455, + "loss": 2.8488, + "mean_token_accuracy": 0.4318700432777405, + "num_tokens": 9379339950.0, + "step": 18348 + }, + { + "epoch": 4.961871281773932, + "grad_norm": 3.796875, + "learning_rate": 0.0020027839651512776, + "loss": 2.7306, + "mean_token_accuracy": 0.4359068274497986, + "num_tokens": 9379811299.0, + "step": 18349 + }, + { + "epoch": 4.962141698215252, + "grad_norm": 4.25, + "learning_rate": 0.002002744894426856, + "loss": 2.6489, + "mean_token_accuracy": 0.49093562364578247, + "num_tokens": 9380275045.0, + "step": 18350 + }, + { + "epoch": 4.962412114656571, + "grad_norm": 180.0, + "learning_rate": 0.00200270609976468, + "loss": 6.414, + "mean_token_accuracy": 0.22765395045280457, + "num_tokens": 9380733839.0, + "step": 18351 + }, + { + "epoch": 4.962682531097891, + "grad_norm": 7.9375, + "learning_rate": 0.0020026675811659405, + "loss": 2.8923, + "mean_token_accuracy": 0.42145058512687683, + "num_tokens": 9381258100.0, + "step": 18352 + }, + { + "epoch": 4.96295294753921, + "grad_norm": 2.515625, + "learning_rate": 0.00200262933863182, + "loss": 2.8855, + "mean_token_accuracy": 0.4417174458503723, + "num_tokens": 9381782333.0, + "step": 18353 + }, + { + "epoch": 4.96322336398053, + "grad_norm": 4.34375, + "learning_rate": 0.00200259137216349, + "loss": 2.5859, + "mean_token_accuracy": 0.45787423849105835, + "num_tokens": 9382306460.0, + "step": 18354 + }, + { + "epoch": 4.9634937804218495, + "grad_norm": 3.0625, + "learning_rate": 0.0020025536817621157, + "loss": 2.7065, + "mean_token_accuracy": 0.4744378328323364, + "num_tokens": 9382803446.0, + "step": 18355 + }, + { + "epoch": 4.96376419686317, + "grad_norm": 3.890625, + "learning_rate": 0.0020025162674288553, + "loss": 2.737, + "mean_token_accuracy": 0.44789543747901917, + "num_tokens": 9383327584.0, + "step": 18356 + }, + { + "epoch": 4.964034613304489, + "grad_norm": 3.40625, + "learning_rate": 0.002002479129164855, + "loss": 2.8595, + "mean_token_accuracy": 0.43888387084007263, + "num_tokens": 9383851707.0, + "step": 18357 + }, + { + "epoch": 4.964305029745809, + "grad_norm": 4.40625, + "learning_rate": 0.002002442266971256, + "loss": 2.6427, + "mean_token_accuracy": 0.44445037841796875, + "num_tokens": 9384375886.0, + "step": 18358 + }, + { + "epoch": 4.964575446187128, + "grad_norm": 3.421875, + "learning_rate": 0.002002405680849188, + "loss": 2.5415, + "mean_token_accuracy": 0.4741843342781067, + "num_tokens": 9384861915.0, + "step": 18359 + }, + { + "epoch": 4.964845862628448, + "grad_norm": 4.625, + "learning_rate": 0.0020023693707997736, + "loss": 2.9099, + "mean_token_accuracy": 0.42796894907951355, + "num_tokens": 9385386158.0, + "step": 18360 + }, + { + "epoch": 4.965116279069767, + "grad_norm": 3.59375, + "learning_rate": 0.0020023333368241274, + "loss": 2.7552, + "mean_token_accuracy": 0.4495845139026642, + "num_tokens": 9385910286.0, + "step": 18361 + }, + { + "epoch": 4.9653866955110875, + "grad_norm": 3.78125, + "learning_rate": 0.0020022975789233542, + "loss": 2.6374, + "mean_token_accuracy": 0.45775800943374634, + "num_tokens": 9386434281.0, + "step": 18362 + }, + { + "epoch": 4.965657111952407, + "grad_norm": 3.96875, + "learning_rate": 0.0020022620970985526, + "loss": 2.8155, + "mean_token_accuracy": 0.43662917613983154, + "num_tokens": 9386907459.0, + "step": 18363 + }, + { + "epoch": 4.965927528393726, + "grad_norm": 4.125, + "learning_rate": 0.0020022268913508097, + "loss": 2.819, + "mean_token_accuracy": 0.4407539367675781, + "num_tokens": 9387431636.0, + "step": 18364 + }, + { + "epoch": 4.966197944835046, + "grad_norm": 4.15625, + "learning_rate": 0.002002191961681207, + "loss": 2.646, + "mean_token_accuracy": 0.4445217251777649, + "num_tokens": 9387955837.0, + "step": 18365 + }, + { + "epoch": 4.966468361276366, + "grad_norm": 3.453125, + "learning_rate": 0.0020021573080908155, + "loss": 2.858, + "mean_token_accuracy": 0.4505327045917511, + "num_tokens": 9388480047.0, + "step": 18366 + }, + { + "epoch": 4.966738777717685, + "grad_norm": 4.6875, + "learning_rate": 0.002002122930580699, + "loss": 2.8162, + "mean_token_accuracy": 0.4353749454021454, + "num_tokens": 9389004168.0, + "step": 18367 + }, + { + "epoch": 4.967009194159004, + "grad_norm": 3.71875, + "learning_rate": 0.0020020888291519125, + "loss": 2.8742, + "mean_token_accuracy": 0.4341968297958374, + "num_tokens": 9389527478.0, + "step": 18368 + }, + { + "epoch": 4.9672796106003245, + "grad_norm": 4.375, + "learning_rate": 0.0020020550038055015, + "loss": 2.8331, + "mean_token_accuracy": 0.42769551277160645, + "num_tokens": 9390051674.0, + "step": 18369 + }, + { + "epoch": 4.967550027041645, + "grad_norm": 3.625, + "learning_rate": 0.002002021454542504, + "loss": 2.6207, + "mean_token_accuracy": 0.45152467489242554, + "num_tokens": 9390575927.0, + "step": 18370 + }, + { + "epoch": 4.967820443482964, + "grad_norm": 288.0, + "learning_rate": 0.0020019881813639496, + "loss": 7.4571, + "mean_token_accuracy": 0.09784109890460968, + "num_tokens": 9391051577.0, + "step": 18371 + }, + { + "epoch": 4.968090859924283, + "grad_norm": 8.5, + "learning_rate": 0.0020019551842708587, + "loss": 2.6712, + "mean_token_accuracy": 0.46566182374954224, + "num_tokens": 9391575651.0, + "step": 18372 + }, + { + "epoch": 4.968361276365603, + "grad_norm": 3.34375, + "learning_rate": 0.0020019224632642453, + "loss": 2.7599, + "mean_token_accuracy": 0.44613441824913025, + "num_tokens": 9392099798.0, + "step": 18373 + }, + { + "epoch": 4.968631692806922, + "grad_norm": 3.65625, + "learning_rate": 0.002001890018345112, + "loss": 2.7215, + "mean_token_accuracy": 0.4491388201713562, + "num_tokens": 9392568466.0, + "step": 18374 + }, + { + "epoch": 4.968902109248242, + "grad_norm": 15.4375, + "learning_rate": 0.0020018578495144537, + "loss": 2.7713, + "mean_token_accuracy": 0.45246872305870056, + "num_tokens": 9393092689.0, + "step": 18375 + }, + { + "epoch": 4.969172525689562, + "grad_norm": 4.375, + "learning_rate": 0.0020018259567732596, + "loss": 2.8372, + "mean_token_accuracy": 0.45595788955688477, + "num_tokens": 9393530745.0, + "step": 18376 + }, + { + "epoch": 4.969442942130882, + "grad_norm": 4.03125, + "learning_rate": 0.0020017943401225073, + "loss": 2.6666, + "mean_token_accuracy": 0.4498879015445709, + "num_tokens": 9394054768.0, + "step": 18377 + }, + { + "epoch": 4.969713358572201, + "grad_norm": 3.40625, + "learning_rate": 0.0020017629995631656, + "loss": 2.7131, + "mean_token_accuracy": 0.44828957319259644, + "num_tokens": 9394578910.0, + "step": 18378 + }, + { + "epoch": 4.969983775013521, + "grad_norm": 4.4375, + "learning_rate": 0.0020017319350961976, + "loss": 2.7044, + "mean_token_accuracy": 0.4314204454421997, + "num_tokens": 9395103158.0, + "step": 18379 + }, + { + "epoch": 4.97025419145484, + "grad_norm": 4.9375, + "learning_rate": 0.0020017011467225562, + "loss": 2.686, + "mean_token_accuracy": 0.4613901674747467, + "num_tokens": 9395627306.0, + "step": 18380 + }, + { + "epoch": 4.97052460789616, + "grad_norm": 3.953125, + "learning_rate": 0.002001670634443185, + "loss": 2.5869, + "mean_token_accuracy": 0.44866690039634705, + "num_tokens": 9396151476.0, + "step": 18381 + }, + { + "epoch": 4.970795024337479, + "grad_norm": 3.640625, + "learning_rate": 0.0020016403982590215, + "loss": 2.5687, + "mean_token_accuracy": 0.4527888298034668, + "num_tokens": 9396675758.0, + "step": 18382 + }, + { + "epoch": 4.9710654407787995, + "grad_norm": 4.0, + "learning_rate": 0.0020016104381709943, + "loss": 2.8941, + "mean_token_accuracy": 0.4269410967826843, + "num_tokens": 9397200026.0, + "step": 18383 + }, + { + "epoch": 4.971335857220119, + "grad_norm": 4.21875, + "learning_rate": 0.00200158075418002, + "loss": 2.8706, + "mean_token_accuracy": 0.4413258135318756, + "num_tokens": 9397724252.0, + "step": 18384 + }, + { + "epoch": 4.971606273661439, + "grad_norm": 3.640625, + "learning_rate": 0.002001551346287011, + "loss": 2.5005, + "mean_token_accuracy": 0.46540969610214233, + "num_tokens": 9398248332.0, + "step": 18385 + }, + { + "epoch": 4.971876690102758, + "grad_norm": 3.265625, + "learning_rate": 0.00200152221449287, + "loss": 2.8472, + "mean_token_accuracy": 0.4363483190536499, + "num_tokens": 9398772610.0, + "step": 18386 + }, + { + "epoch": 4.972147106544078, + "grad_norm": 3.765625, + "learning_rate": 0.002001493358798489, + "loss": 2.8451, + "mean_token_accuracy": 0.42993640899658203, + "num_tokens": 9399296777.0, + "step": 18387 + }, + { + "epoch": 4.972417522985397, + "grad_norm": 3.46875, + "learning_rate": 0.002001464779204756, + "loss": 2.7484, + "mean_token_accuracy": 0.4582115113735199, + "num_tokens": 9399820889.0, + "step": 18388 + }, + { + "epoch": 4.972687939426717, + "grad_norm": 3.359375, + "learning_rate": 0.002001436475712546, + "loss": 2.7258, + "mean_token_accuracy": 0.431107759475708, + "num_tokens": 9400345136.0, + "step": 18389 + }, + { + "epoch": 4.972958355868037, + "grad_norm": 3.015625, + "learning_rate": 0.0020014084483227276, + "loss": 2.8422, + "mean_token_accuracy": 0.43902409076690674, + "num_tokens": 9400869409.0, + "step": 18390 + }, + { + "epoch": 4.973228772309357, + "grad_norm": 68.0, + "learning_rate": 0.002001380697036162, + "loss": 3.2086, + "mean_token_accuracy": 0.4230419397354126, + "num_tokens": 9401393593.0, + "step": 18391 + }, + { + "epoch": 4.973499188750676, + "grad_norm": 9.0, + "learning_rate": 0.0020013532218537, + "loss": 2.8903, + "mean_token_accuracy": 0.4429023265838623, + "num_tokens": 9401858449.0, + "step": 18392 + }, + { + "epoch": 4.973769605191996, + "grad_norm": 2.984375, + "learning_rate": 0.0020013260227761836, + "loss": 2.6804, + "mean_token_accuracy": 0.447915256023407, + "num_tokens": 9402369359.0, + "step": 18393 + }, + { + "epoch": 4.974040021633315, + "grad_norm": 3.34375, + "learning_rate": 0.0020012990998044488, + "loss": 2.6733, + "mean_token_accuracy": 0.4700881540775299, + "num_tokens": 9402824751.0, + "step": 18394 + }, + { + "epoch": 4.974310438074635, + "grad_norm": 4.46875, + "learning_rate": 0.00200127245293932, + "loss": 2.8725, + "mean_token_accuracy": 0.4318802058696747, + "num_tokens": 9403348997.0, + "step": 18395 + }, + { + "epoch": 4.974580854515954, + "grad_norm": 5.5, + "learning_rate": 0.0020012460821816165, + "loss": 2.7007, + "mean_token_accuracy": 0.43970024585723877, + "num_tokens": 9403873140.0, + "step": 18396 + }, + { + "epoch": 4.9748512709572745, + "grad_norm": 3.515625, + "learning_rate": 0.0020012199875321475, + "loss": 2.6966, + "mean_token_accuracy": 0.4536578059196472, + "num_tokens": 9404397426.0, + "step": 18397 + }, + { + "epoch": 4.975121687398594, + "grad_norm": 3.46875, + "learning_rate": 0.0020011941689917117, + "loss": 2.7401, + "mean_token_accuracy": 0.4585685431957245, + "num_tokens": 9404921690.0, + "step": 18398 + }, + { + "epoch": 4.975392103839914, + "grad_norm": 3.421875, + "learning_rate": 0.002001168626561103, + "loss": 2.5474, + "mean_token_accuracy": 0.4466745853424072, + "num_tokens": 9405445738.0, + "step": 18399 + }, + { + "epoch": 4.975662520281233, + "grad_norm": 3.21875, + "learning_rate": 0.002001143360241105, + "loss": 2.7883, + "mean_token_accuracy": 0.466019868850708, + "num_tokens": 9405885275.0, + "step": 18400 + }, + { + "epoch": 4.975932936722553, + "grad_norm": 3.234375, + "learning_rate": 0.0020011183700324926, + "loss": 2.7342, + "mean_token_accuracy": 0.4538313150405884, + "num_tokens": 9406409553.0, + "step": 18401 + }, + { + "epoch": 4.976203353163872, + "grad_norm": 3.703125, + "learning_rate": 0.002001093655936033, + "loss": 2.7427, + "mean_token_accuracy": 0.4252052307128906, + "num_tokens": 9406933809.0, + "step": 18402 + }, + { + "epoch": 4.976473769605192, + "grad_norm": 4.0625, + "learning_rate": 0.002001069217952483, + "loss": 2.6004, + "mean_token_accuracy": 0.45280444622039795, + "num_tokens": 9407458031.0, + "step": 18403 + }, + { + "epoch": 4.976744186046512, + "grad_norm": 3.5, + "learning_rate": 0.0020010450560825946, + "loss": 2.8559, + "mean_token_accuracy": 0.4287378191947937, + "num_tokens": 9407976070.0, + "step": 18404 + }, + { + "epoch": 4.977014602487831, + "grad_norm": 4.84375, + "learning_rate": 0.002001021170327108, + "loss": 2.5501, + "mean_token_accuracy": 0.4597226679325104, + "num_tokens": 9408500281.0, + "step": 18405 + }, + { + "epoch": 4.977285018929151, + "grad_norm": 3.265625, + "learning_rate": 0.0020009975606867553, + "loss": 2.6932, + "mean_token_accuracy": 0.4608217477798462, + "num_tokens": 9409024549.0, + "step": 18406 + }, + { + "epoch": 4.977555435370471, + "grad_norm": 4.375, + "learning_rate": 0.002000974227162263, + "loss": 2.7308, + "mean_token_accuracy": 0.44742172956466675, + "num_tokens": 9409531854.0, + "step": 18407 + }, + { + "epoch": 4.97782585181179, + "grad_norm": 3.796875, + "learning_rate": 0.0020009511697543446, + "loss": 2.7386, + "mean_token_accuracy": 0.46282511949539185, + "num_tokens": 9409991779.0, + "step": 18408 + }, + { + "epoch": 4.978096268253109, + "grad_norm": 3.890625, + "learning_rate": 0.002000928388463709, + "loss": 2.8725, + "mean_token_accuracy": 0.4416216015815735, + "num_tokens": 9410481491.0, + "step": 18409 + }, + { + "epoch": 4.9783666846944294, + "grad_norm": 4.21875, + "learning_rate": 0.002000905883291055, + "loss": 2.7757, + "mean_token_accuracy": 0.44834694266319275, + "num_tokens": 9411000137.0, + "step": 18410 + }, + { + "epoch": 4.9786371011357495, + "grad_norm": 44.25, + "learning_rate": 0.002000883654237074, + "loss": 3.685, + "mean_token_accuracy": 0.2939138412475586, + "num_tokens": 9411499245.0, + "step": 18411 + }, + { + "epoch": 4.978907517577069, + "grad_norm": 8.375, + "learning_rate": 0.0020008617013024467, + "loss": 2.6864, + "mean_token_accuracy": 0.4327183663845062, + "num_tokens": 9412023526.0, + "step": 18412 + }, + { + "epoch": 4.979177934018388, + "grad_norm": 3.28125, + "learning_rate": 0.0020008400244878465, + "loss": 2.6688, + "mean_token_accuracy": 0.4321911931037903, + "num_tokens": 9412510224.0, + "step": 18413 + }, + { + "epoch": 4.979448350459708, + "grad_norm": 4.21875, + "learning_rate": 0.0020008186237939394, + "loss": 2.5386, + "mean_token_accuracy": 0.44590306282043457, + "num_tokens": 9413034504.0, + "step": 18414 + }, + { + "epoch": 4.979718766901027, + "grad_norm": 4.15625, + "learning_rate": 0.0020007974992213825, + "loss": 2.9256, + "mean_token_accuracy": 0.44035160541534424, + "num_tokens": 9413558764.0, + "step": 18415 + }, + { + "epoch": 4.979989183342347, + "grad_norm": 4.21875, + "learning_rate": 0.0020007766507708223, + "loss": 2.8265, + "mean_token_accuracy": 0.43089354038238525, + "num_tokens": 9414082869.0, + "step": 18416 + }, + { + "epoch": 4.9802595997836665, + "grad_norm": 4.15625, + "learning_rate": 0.0020007560784429, + "loss": 2.8727, + "mean_token_accuracy": 0.4456970691680908, + "num_tokens": 9414607086.0, + "step": 18417 + }, + { + "epoch": 4.980530016224987, + "grad_norm": 4.9375, + "learning_rate": 0.002000735782238246, + "loss": 2.8651, + "mean_token_accuracy": 0.4373389780521393, + "num_tokens": 9415106193.0, + "step": 18418 + }, + { + "epoch": 4.980800432666306, + "grad_norm": 4.4375, + "learning_rate": 0.002000715762157484, + "loss": 2.7348, + "mean_token_accuracy": 0.4696666896343231, + "num_tokens": 9415566067.0, + "step": 18419 + }, + { + "epoch": 4.981070849107626, + "grad_norm": 38.0, + "learning_rate": 0.002000696018201227, + "loss": 3.046, + "mean_token_accuracy": 0.4143872857093811, + "num_tokens": 9416090314.0, + "step": 18420 + }, + { + "epoch": 4.981341265548945, + "grad_norm": 5.8125, + "learning_rate": 0.002000676550370082, + "loss": 2.7278, + "mean_token_accuracy": 0.4211518168449402, + "num_tokens": 9416601661.0, + "step": 18421 + }, + { + "epoch": 4.981611681990265, + "grad_norm": 3.09375, + "learning_rate": 0.002000657358664645, + "loss": 2.7787, + "mean_token_accuracy": 0.44616222381591797, + "num_tokens": 9417122322.0, + "step": 18422 + }, + { + "epoch": 4.981882098431584, + "grad_norm": 3.140625, + "learning_rate": 0.002000638443085506, + "loss": 2.5286, + "mean_token_accuracy": 0.4689570963382721, + "num_tokens": 9417646493.0, + "step": 18423 + }, + { + "epoch": 4.9821525148729044, + "grad_norm": 4.46875, + "learning_rate": 0.0020006198036332445, + "loss": 3.0407, + "mean_token_accuracy": 0.41949978470802307, + "num_tokens": 9418170771.0, + "step": 18424 + }, + { + "epoch": 4.982422931314224, + "grad_norm": 5.21875, + "learning_rate": 0.002000601440308433, + "loss": 2.9819, + "mean_token_accuracy": 0.41989314556121826, + "num_tokens": 9418695046.0, + "step": 18425 + }, + { + "epoch": 4.982693347755544, + "grad_norm": 3.921875, + "learning_rate": 0.002000583353111635, + "loss": 2.7852, + "mean_token_accuracy": 0.4403999149799347, + "num_tokens": 9419219321.0, + "step": 18426 + }, + { + "epoch": 4.982963764196863, + "grad_norm": 11.0625, + "learning_rate": 0.002000565542043405, + "loss": 2.5781, + "mean_token_accuracy": 0.5180232524871826, + "num_tokens": 9419743590.0, + "step": 18427 + }, + { + "epoch": 4.983234180638183, + "grad_norm": 5.59375, + "learning_rate": 0.0020005480071042902, + "loss": 2.8364, + "mean_token_accuracy": 0.4205363392829895, + "num_tokens": 9420232134.0, + "step": 18428 + }, + { + "epoch": 4.983504597079502, + "grad_norm": 4.5, + "learning_rate": 0.0020005307482948275, + "loss": 2.723, + "mean_token_accuracy": 0.42951852083206177, + "num_tokens": 9420733343.0, + "step": 18429 + }, + { + "epoch": 4.983775013520822, + "grad_norm": 2.953125, + "learning_rate": 0.0020005137656155477, + "loss": 2.8976, + "mean_token_accuracy": 0.4357142448425293, + "num_tokens": 9421257524.0, + "step": 18430 + }, + { + "epoch": 4.9840454299621415, + "grad_norm": 25.125, + "learning_rate": 0.0020004970590669713, + "loss": 2.5966, + "mean_token_accuracy": 0.47490042448043823, + "num_tokens": 9421759944.0, + "step": 18431 + }, + { + "epoch": 4.984315846403462, + "grad_norm": 7.75, + "learning_rate": 0.0020004806286496118, + "loss": 2.7837, + "mean_token_accuracy": 0.43136605620384216, + "num_tokens": 9422284123.0, + "step": 18432 + }, + { + "epoch": 4.984586262844781, + "grad_norm": 3.96875, + "learning_rate": 0.0020004644743639717, + "loss": 2.6598, + "mean_token_accuracy": 0.44462090730667114, + "num_tokens": 9422808388.0, + "step": 18433 + }, + { + "epoch": 4.984856679286101, + "grad_norm": 4.09375, + "learning_rate": 0.0020004485962105464, + "loss": 2.7858, + "mean_token_accuracy": 0.4356977939605713, + "num_tokens": 9423332538.0, + "step": 18434 + }, + { + "epoch": 4.98512709572742, + "grad_norm": 24.0, + "learning_rate": 0.002000432994189825, + "loss": 2.9953, + "mean_token_accuracy": 0.40458524227142334, + "num_tokens": 9423856653.0, + "step": 18435 + }, + { + "epoch": 4.98539751216874, + "grad_norm": 4.75, + "learning_rate": 0.002000417668302286, + "loss": 2.7294, + "mean_token_accuracy": 0.4457587003707886, + "num_tokens": 9424380722.0, + "step": 18436 + }, + { + "epoch": 4.985667928610059, + "grad_norm": 3.390625, + "learning_rate": 0.002000402618548398, + "loss": 2.6892, + "mean_token_accuracy": 0.47396859526634216, + "num_tokens": 9424844545.0, + "step": 18437 + }, + { + "epoch": 4.9859383450513795, + "grad_norm": 3.28125, + "learning_rate": 0.0020003878449286246, + "loss": 2.6506, + "mean_token_accuracy": 0.45421102643013, + "num_tokens": 9425355177.0, + "step": 18438 + }, + { + "epoch": 4.986208761492699, + "grad_norm": 5.21875, + "learning_rate": 0.002000373347443418, + "loss": 2.9735, + "mean_token_accuracy": 0.4010539650917053, + "num_tokens": 9425858159.0, + "step": 18439 + }, + { + "epoch": 4.986479177934019, + "grad_norm": 4.0625, + "learning_rate": 0.0020003591260932224, + "loss": 2.8771, + "mean_token_accuracy": 0.44328394532203674, + "num_tokens": 9426382348.0, + "step": 18440 + }, + { + "epoch": 4.986749594375338, + "grad_norm": 3.90625, + "learning_rate": 0.002000345180878477, + "loss": 2.7689, + "mean_token_accuracy": 0.4377770721912384, + "num_tokens": 9426906499.0, + "step": 18441 + }, + { + "epoch": 4.987020010816658, + "grad_norm": 3.703125, + "learning_rate": 0.002000331511799606, + "loss": 2.7725, + "mean_token_accuracy": 0.44085073471069336, + "num_tokens": 9427343784.0, + "step": 18442 + }, + { + "epoch": 4.987290427257977, + "grad_norm": 3.90625, + "learning_rate": 0.0020003181188570306, + "loss": 2.9346, + "mean_token_accuracy": 0.4571816921234131, + "num_tokens": 9427804303.0, + "step": 18443 + }, + { + "epoch": 4.987560843699297, + "grad_norm": 4.21875, + "learning_rate": 0.002000305002051162, + "loss": 2.6856, + "mean_token_accuracy": 0.4676769971847534, + "num_tokens": 9428299324.0, + "step": 18444 + }, + { + "epoch": 4.9878312601406165, + "grad_norm": 4.03125, + "learning_rate": 0.002000292161382402, + "loss": 2.7952, + "mean_token_accuracy": 0.44403520226478577, + "num_tokens": 9428823471.0, + "step": 18445 + }, + { + "epoch": 4.988101676581936, + "grad_norm": 3.640625, + "learning_rate": 0.0020002795968511455, + "loss": 2.8362, + "mean_token_accuracy": 0.4391740560531616, + "num_tokens": 9429347747.0, + "step": 18446 + }, + { + "epoch": 4.988372093023256, + "grad_norm": 3.703125, + "learning_rate": 0.0020002673084577767, + "loss": 2.8754, + "mean_token_accuracy": 0.4284343719482422, + "num_tokens": 9429871999.0, + "step": 18447 + }, + { + "epoch": 4.988642509464576, + "grad_norm": 3.65625, + "learning_rate": 0.002000255296202674, + "loss": 2.7989, + "mean_token_accuracy": 0.463948518037796, + "num_tokens": 9430332284.0, + "step": 18448 + }, + { + "epoch": 4.988912925905895, + "grad_norm": 3.515625, + "learning_rate": 0.002000243560086206, + "loss": 2.8452, + "mean_token_accuracy": 0.41666221618652344, + "num_tokens": 9430830239.0, + "step": 18449 + }, + { + "epoch": 4.989183342347214, + "grad_norm": 4.09375, + "learning_rate": 0.0020002321001087306, + "loss": 2.8572, + "mean_token_accuracy": 0.44657617807388306, + "num_tokens": 9431339677.0, + "step": 18450 + }, + { + "epoch": 4.989453758788534, + "grad_norm": 43.5, + "learning_rate": 0.002000220916270602, + "loss": 3.133, + "mean_token_accuracy": 0.39573225378990173, + "num_tokens": 9431830044.0, + "step": 18451 + }, + { + "epoch": 4.9897241752298545, + "grad_norm": 8.6875, + "learning_rate": 0.0020002100085721613, + "loss": 2.9607, + "mean_token_accuracy": 0.4207943379878998, + "num_tokens": 9432354240.0, + "step": 18452 + }, + { + "epoch": 4.989994591671174, + "grad_norm": 2.984375, + "learning_rate": 0.002000199377013745, + "loss": 2.7225, + "mean_token_accuracy": 0.4565453827381134, + "num_tokens": 9432828020.0, + "step": 18453 + }, + { + "epoch": 4.990265008112493, + "grad_norm": 3.65625, + "learning_rate": 0.0020001890215956787, + "loss": 2.6053, + "mean_token_accuracy": 0.4554425776004791, + "num_tokens": 9433352305.0, + "step": 18454 + }, + { + "epoch": 4.990535424553813, + "grad_norm": 3.484375, + "learning_rate": 0.0020001789423182794, + "loss": 2.6628, + "mean_token_accuracy": 0.45085975527763367, + "num_tokens": 9433870961.0, + "step": 18455 + }, + { + "epoch": 4.990805840995132, + "grad_norm": 4.4375, + "learning_rate": 0.0020001691391818574, + "loss": 2.518, + "mean_token_accuracy": 0.4669821262359619, + "num_tokens": 9434389588.0, + "step": 18456 + }, + { + "epoch": 4.991076257436452, + "grad_norm": 4.28125, + "learning_rate": 0.002000159612186713, + "loss": 2.9717, + "mean_token_accuracy": 0.43165531754493713, + "num_tokens": 9434869158.0, + "step": 18457 + }, + { + "epoch": 4.991346673877771, + "grad_norm": 3.8125, + "learning_rate": 0.0020001503613331375, + "loss": 2.8054, + "mean_token_accuracy": 0.4447687268257141, + "num_tokens": 9435393322.0, + "step": 18458 + }, + { + "epoch": 4.9916170903190915, + "grad_norm": 4.0625, + "learning_rate": 0.0020001413866214167, + "loss": 2.5573, + "mean_token_accuracy": 0.4617267847061157, + "num_tokens": 9435917423.0, + "step": 18459 + }, + { + "epoch": 4.991887506760411, + "grad_norm": 4.125, + "learning_rate": 0.002000132688051825, + "loss": 2.5892, + "mean_token_accuracy": 0.43308794498443604, + "num_tokens": 9436441515.0, + "step": 18460 + }, + { + "epoch": 4.992157923201731, + "grad_norm": 3.8125, + "learning_rate": 0.0020001242656246285, + "loss": 2.67, + "mean_token_accuracy": 0.45058301091194153, + "num_tokens": 9436965692.0, + "step": 18461 + }, + { + "epoch": 4.99242833964305, + "grad_norm": 3.75, + "learning_rate": 0.0020001161193400874, + "loss": 2.6649, + "mean_token_accuracy": 0.4662356972694397, + "num_tokens": 9437489968.0, + "step": 18462 + }, + { + "epoch": 4.99269875608437, + "grad_norm": 4.46875, + "learning_rate": 0.00200010824919845, + "loss": 2.6957, + "mean_token_accuracy": 0.4503540098667145, + "num_tokens": 9438014183.0, + "step": 18463 + }, + { + "epoch": 4.992969172525689, + "grad_norm": 3.640625, + "learning_rate": 0.002000100655199959, + "loss": 2.7204, + "mean_token_accuracy": 0.4548223316669464, + "num_tokens": 9438488738.0, + "step": 18464 + }, + { + "epoch": 4.993239588967009, + "grad_norm": 3.75, + "learning_rate": 0.002000093337344847, + "loss": 2.7078, + "mean_token_accuracy": 0.4330595135688782, + "num_tokens": 9439012927.0, + "step": 18465 + }, + { + "epoch": 4.993510005408329, + "grad_norm": 3.5625, + "learning_rate": 0.002000086295633338, + "loss": 2.6282, + "mean_token_accuracy": 0.4511028826236725, + "num_tokens": 9439537077.0, + "step": 18466 + }, + { + "epoch": 4.993780421849649, + "grad_norm": 4.375, + "learning_rate": 0.002000079530065648, + "loss": 2.8259, + "mean_token_accuracy": 0.4363460838794708, + "num_tokens": 9440061250.0, + "step": 18467 + }, + { + "epoch": 4.994050838290968, + "grad_norm": 4.84375, + "learning_rate": 0.002000073040641986, + "loss": 2.7987, + "mean_token_accuracy": 0.4584163725376129, + "num_tokens": 9440561238.0, + "step": 18468 + }, + { + "epoch": 4.994321254732288, + "grad_norm": 4.15625, + "learning_rate": 0.00200006682736255, + "loss": 2.8283, + "mean_token_accuracy": 0.44183698296546936, + "num_tokens": 9441085307.0, + "step": 18469 + }, + { + "epoch": 4.994591671173607, + "grad_norm": 4.5, + "learning_rate": 0.0020000608902275303, + "loss": 2.6749, + "mean_token_accuracy": 0.4388004541397095, + "num_tokens": 9441575073.0, + "step": 18470 + }, + { + "epoch": 4.994862087614927, + "grad_norm": 15.25, + "learning_rate": 0.00200005522923711, + "loss": 2.1602, + "mean_token_accuracy": 0.5284894704818726, + "num_tokens": 9442099174.0, + "step": 18471 + }, + { + "epoch": 4.995132504056246, + "grad_norm": 6.25, + "learning_rate": 0.0020000498443914624, + "loss": 2.8398, + "mean_token_accuracy": 0.4345875084400177, + "num_tokens": 9442623338.0, + "step": 18472 + }, + { + "epoch": 4.9954029204975665, + "grad_norm": 3.578125, + "learning_rate": 0.002000044735690752, + "loss": 2.6989, + "mean_token_accuracy": 0.43288806080818176, + "num_tokens": 9443147491.0, + "step": 18473 + }, + { + "epoch": 4.995673336938886, + "grad_norm": 3.421875, + "learning_rate": 0.0020000399031351373, + "loss": 2.8142, + "mean_token_accuracy": 0.4470457434654236, + "num_tokens": 9443671744.0, + "step": 18474 + }, + { + "epoch": 4.995943753380206, + "grad_norm": 3.609375, + "learning_rate": 0.0020000353467247655, + "loss": 2.6043, + "mean_token_accuracy": 0.4576202630996704, + "num_tokens": 9444195939.0, + "step": 18475 + }, + { + "epoch": 4.996214169821525, + "grad_norm": 3.28125, + "learning_rate": 0.002000031066459776, + "loss": 2.7866, + "mean_token_accuracy": 0.4493725001811981, + "num_tokens": 9444720217.0, + "step": 18476 + }, + { + "epoch": 4.996484586262845, + "grad_norm": 3.9375, + "learning_rate": 0.0020000270623403, + "loss": 2.7606, + "mean_token_accuracy": 0.43363046646118164, + "num_tokens": 9445244318.0, + "step": 18477 + }, + { + "epoch": 4.996755002704164, + "grad_norm": 4.15625, + "learning_rate": 0.0020000233343664612, + "loss": 2.508, + "mean_token_accuracy": 0.4525202810764313, + "num_tokens": 9445768531.0, + "step": 18478 + }, + { + "epoch": 4.997025419145484, + "grad_norm": 4.03125, + "learning_rate": 0.002000019882538374, + "loss": 2.9381, + "mean_token_accuracy": 0.4326550364494324, + "num_tokens": 9446280423.0, + "step": 18479 + }, + { + "epoch": 4.997295835586804, + "grad_norm": 4.46875, + "learning_rate": 0.002000016706856144, + "loss": 2.7967, + "mean_token_accuracy": 0.4317006468772888, + "num_tokens": 9446804660.0, + "step": 18480 + }, + { + "epoch": 4.997566252028124, + "grad_norm": 3.203125, + "learning_rate": 0.0020000138073198686, + "loss": 2.6402, + "mean_token_accuracy": 0.46434807777404785, + "num_tokens": 9447328916.0, + "step": 18481 + }, + { + "epoch": 4.997836668469443, + "grad_norm": 4.1875, + "learning_rate": 0.0020000111839296375, + "loss": 2.7401, + "mean_token_accuracy": 0.4628404676914215, + "num_tokens": 9447853122.0, + "step": 18482 + }, + { + "epoch": 4.998107084910763, + "grad_norm": 4.3125, + "learning_rate": 0.0020000088366855296, + "loss": 2.6213, + "mean_token_accuracy": 0.46080586314201355, + "num_tokens": 9448313830.0, + "step": 18483 + }, + { + "epoch": 4.998377501352082, + "grad_norm": 2.96875, + "learning_rate": 0.0020000067655876184, + "loss": 2.7885, + "mean_token_accuracy": 0.4335612952709198, + "num_tokens": 9448838002.0, + "step": 18484 + }, + { + "epoch": 4.998647917793402, + "grad_norm": 4.21875, + "learning_rate": 0.0020000049706359662, + "loss": 2.8638, + "mean_token_accuracy": 0.44396254420280457, + "num_tokens": 9449362207.0, + "step": 18485 + }, + { + "epoch": 4.998918334234721, + "grad_norm": 4.15625, + "learning_rate": 0.0020000034518306295, + "loss": 2.8275, + "mean_token_accuracy": 0.4291003942489624, + "num_tokens": 9449886432.0, + "step": 18486 + }, + { + "epoch": 4.999188750676041, + "grad_norm": 4.1875, + "learning_rate": 0.002000002209171654, + "loss": 2.7472, + "mean_token_accuracy": 0.44577693939208984, + "num_tokens": 9450410600.0, + "step": 18487 + }, + { + "epoch": 4.999459167117361, + "grad_norm": 4.125, + "learning_rate": 0.0020000012426590777, + "loss": 2.761, + "mean_token_accuracy": 0.4533671736717224, + "num_tokens": 9450873892.0, + "step": 18488 + }, + { + "epoch": 4.999729583558681, + "grad_norm": 3.953125, + "learning_rate": 0.0020000005522929306, + "loss": 2.8075, + "mean_token_accuracy": 0.44718268513679504, + "num_tokens": 9451398016.0, + "step": 18489 + }, + { + "epoch": 5.0, + "grad_norm": 4.84375, + "learning_rate": 0.0020000001380732343, + "loss": 2.862, + "mean_token_accuracy": 0.4288099408149719, + "num_tokens": 9451622395.0, + "step": 18490 + }, + { + "epoch": 5.0, + "step": 18490, + "total_flos": 5.519394781646763e+19, + "train_loss": 3.57993624316737, + "train_runtime": 147576.5591, + "train_samples_per_second": 1.002, + "train_steps_per_second": 0.125 + } + ], + "logging_steps": 1, + "max_steps": 18490, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 925, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.519394781646763e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..5257bef --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:735b17fa74b923b85940d83f8c0168f738f22974664b96b4b43fea1cdb3b4557 +size 10616